Add cocktail party spatial filtering (#7)

audio_stream.py: Added focus_side property. When set, the stream
yields from the focused side regardless of energy (attention lock).
When None, falls back to energy-based auto selection.

multi_speaker.py: When beams lock onto 2 speakers, sets audio focus
to the target speaker's side. Auto-switches target when the current
target goes silent and the other starts talking. Manual focus via API.

headmic.py: New endpoint POST /speakers/focus?speaker=0|1 to manually
switch attention. /speakers/tracked now shows is_target, target_speaker,
and audio_focus fields.

The cocktail party effect: when 2 people are talking, the audio feed
to Porcupine/VAD/transcription comes from the target speaker's direction,
suppressing the other. XVF3800 beam gating silences the non-speaking beam,
and audio_stream focus locks the ear facing the target.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex
2026-04-12 21:47:30 -05:00
parent 38d21ef53c
commit 0705b3818b
3 changed files with 64 additions and 8 deletions

View File

@@ -117,6 +117,7 @@ class DualAudioStream:
self.left = MicStream("left", left_device) self.left = MicStream("left", left_device)
self.right = MicStream("right", right_device) if right_device else None self.right = MicStream("right", right_device) if right_device else None
self.active_side: str = "left" self.active_side: str = "left"
self.focus_side: Optional[str] = None # None=auto (energy), "left"/"right"=locked attention
self._running = False self._running = False
def start(self): def start(self):
@@ -162,18 +163,25 @@ class DualAudioStream:
last_frame_left = frame_left last_frame_left = frame_left
last_frame_right = frame_right last_frame_right = frame_right
# Pick best beam # Pick beam: focused attention overrides energy-based selection
if frame_right is None: if frame_right is None:
self.active_side = "left" self.active_side = "left"
yield frame_left, "left" yield frame_left, "left"
elif self.focus_side:
# Cocktail party mode: locked onto a specific side
self.active_side = self.focus_side
if self.focus_side == "right" and frame_right:
yield frame_right, "right"
else:
yield frame_left, "left"
else: else:
# Auto mode: pick higher-energy side
left_energy = self.left.get_energy() left_energy = self.left.get_energy()
right_energy = self.right.get_energy() right_energy = self.right.get_energy()
if right_energy > left_energy * 1.1: # 10% hysteresis if right_energy > left_energy * 1.1:
self.active_side = "right" self.active_side = "right"
elif left_energy > right_energy * 1.1: elif left_energy > right_energy * 1.1:
self.active_side = "left" self.active_side = "left"
# else: keep current active_side (hysteresis prevents flapping)
if self.active_side == "right" and frame_right: if self.active_side == "right" and frame_right:
yield frame_right, "right" yield frame_right, "right"

View File

@@ -588,8 +588,8 @@ async def startup():
# --- Multi-speaker tracking --- # --- Multi-speaker tracking ---
if xvf_manager.left or xvf_manager.right: if xvf_manager.left or xvf_manager.right:
from multi_speaker import MultiSpeakerTracker from multi_speaker import MultiSpeakerTracker
multi_speaker = MultiSpeakerTracker(xvf_manager) multi_speaker = MultiSpeakerTracker(xvf_manager, audio_stream=dual_stream)
logger.info("Multi-speaker tracking enabled (2 beams per array)") logger.info("Multi-speaker tracking enabled (2 beams per array, cocktail party filtering)")
# --- Binaural recording --- # --- Binaural recording ---
if os.environ.get("BINAURAL_RECORD", "").lower() in ("1", "true", "yes"): if os.environ.get("BINAURAL_RECORD", "").lower() in ("1", "true", "yes"):
@@ -692,6 +692,19 @@ async def tracked_speakers():
return state.multi_speaker return state.multi_speaker
@app.post("/speakers/focus")
async def focus_speaker(speaker: int = 0):
    """Manually switch attention to one of the two tracked speakers.

    In cocktail party mode the focused speaker's audio is what feeds the
    wake-word engine and transcription pipeline.

    Args:
        speaker: index of the tracked speaker to focus on (0 or 1).

    Raises:
        HTTPException: 503 if multi-speaker tracking is not running,
            400 if the index is out of range.
    """
    # Guard: tracking may be disabled when no XVF array was detected at startup.
    if multi_speaker is None:
        raise HTTPException(status_code=503, detail="Multi-speaker tracking not available")
    if speaker != 0 and speaker != 1:
        raise HTTPException(status_code=400, detail="Speaker index must be 0 or 1")
    # Retarget the tracker, then push the new focus down to the audio stream.
    multi_speaker.target_speaker_idx = speaker
    multi_speaker._update_audio_focus()
    return {"ok": True, "target_speaker": speaker}
# --- Spatial scene --- # --- Spatial scene ---
@app.get("/scene") @app.get("/scene")

View File

@@ -80,10 +80,12 @@ class TrackedSpeaker:
class MultiSpeakerTracker: class MultiSpeakerTracker:
"""Track multiple speakers and manage beam steering.""" """Track multiple speakers and manage beam steering."""
def __init__(self, xvf_manager): def __init__(self, xvf_manager, audio_stream=None):
self.xvf = xvf_manager self.xvf = xvf_manager
self.audio_stream = audio_stream # for cocktail party focus control
self.speakers: list[TrackedSpeaker] = [] self.speakers: list[TrackedSpeaker] = []
self.fixed_mode = False self.fixed_mode = False
self.target_speaker_idx: int = 0 # which speaker is the "target" (0 or 1)
self._lock = threading.Lock() self._lock = threading.Lock()
def update(self, doa: dict) -> dict: def update(self, doa: dict) -> dict:
@@ -144,7 +146,8 @@ class MultiSpeakerTracker:
return best return best
def _manage_beams(self): def _manage_beams(self):
"""Switch between auto and fixed beam mode based on speaker count.""" """Switch between auto and fixed beam mode based on speaker count.
Also manages audio focus for cocktail party filtering."""
stable_speakers = [s for s in self.speakers if s.stable] stable_speakers = [s for s in self.speakers if s.stable]
if len(stable_speakers) >= 2 and not self.fixed_mode: if len(stable_speakers) >= 2 and not self.fixed_mode:
@@ -157,34 +160,66 @@ class MultiSpeakerTracker:
s2.beam_locked = True s2.beam_locked = True
self.fixed_mode = True self.fixed_mode = True
# Focus audio on the target speaker's side
self._update_audio_focus()
elif len(stable_speakers) >= 2 and self.fixed_mode: elif len(stable_speakers) >= 2 and self.fixed_mode:
# Update beam directions if speakers moved # Update beam directions if speakers moved
s1, s2 = stable_speakers[0], stable_speakers[1] s1, s2 = stable_speakers[0], stable_speakers[1]
self.xvf.steer_beams(s1.angle, s2.angle) self.xvf.steer_beams(s1.angle, s2.angle)
# If the non-target speaker starts talking and target is silent,
# auto-switch target to the active one
target = stable_speakers[self.target_speaker_idx]
other_idx = 1 - self.target_speaker_idx
other = stable_speakers[other_idx]
if other.active and not target.active and target.silence_duration > 1.0:
self.target_speaker_idx = other_idx
logger.info("Attention shifted to speaker %d at %.0f°",
other_idx + 1, other.angle)
self._update_audio_focus()
elif len(stable_speakers) < 2 and self.fixed_mode: elif len(stable_speakers) < 2 and self.fixed_mode:
# Back to single speaker or silence — release beams # Back to single speaker or silence — release beams
logger.info("Releasing beams — back to auto mode") logger.info("Releasing beams — back to auto mode")
self.xvf.release_beams() self.xvf.release_beams()
self.fixed_mode = False self.fixed_mode = False
self.target_speaker_idx = 0
for s in self.speakers: for s in self.speakers:
s.beam_locked = False s.beam_locked = False
# Release audio focus
if self.audio_stream:
self.audio_stream.focus_side = None
def _update_audio_focus(self):
    """Lock the audio stream's attention onto the target speaker's side.

    No-op when no audio stream was wired in, when nothing is tracked,
    or when the target index points past the current stable speakers.
    """
    if self.audio_stream is None or not self.speakers:
        return
    # Only stable (confirmed) speakers are eligible focus targets.
    stable = [spk for spk in self.speakers if spk.stable]
    idx = self.target_speaker_idx
    if idx >= len(stable):
        # Target index is stale (speaker dropped out) — leave focus as-is.
        return
    target = stable[idx]
    self.audio_stream.focus_side = target.side
    logger.debug("Audio focus: %s (speaker at %.0f°)", target.side, target.angle)
def _get_state(self) -> dict: def _get_state(self) -> dict:
return { return {
"speakers": [ "speakers": [
{ {
"index": i,
"angle": round(s.angle, 1), "angle": round(s.angle, 1),
"side": s.side, "side": s.side,
"active": s.active, "active": s.active,
"beam_locked": s.beam_locked, "beam_locked": s.beam_locked,
"is_target": i == self.target_speaker_idx,
"age_seconds": round(s.age, 1), "age_seconds": round(s.age, 1),
"silence_seconds": round(s.silence_duration, 1), "silence_seconds": round(s.silence_duration, 1),
"speaker_name": s.speaker_name, "speaker_name": s.speaker_name,
} }
for s in self.speakers for i, s in enumerate(self.speakers)
], ],
"beam_mode": "fixed" if self.fixed_mode else "auto", "beam_mode": "fixed" if self.fixed_mode else "auto",
"target_speaker": self.target_speaker_idx,
"audio_focus": self.audio_stream.focus_side if self.audio_stream else None,
"active_count": sum(1 for s in self.speakers if s.active), "active_count": sum(1 for s in self.speakers if s.active),
"total_tracked": len(self.speakers), "total_tracked": len(self.speakers),
} }