Add spatial audio scene mapping + sound event localization (#6 + #8)

spatial_scene.py: Builds a persistent map of where each sound category
usually comes from (30° angle bins, circular mean). Detects anomalies
when a sound appears from an unusual direction (90°+ deviation).
Scene map persists to ~/.vixy/scene_map.json across restarts.

headmic.py: Feed classified sounds + spatial position into scene tracker.
New endpoints:
  /scene — learned scene summary + last anomaly
  /scene/events — recent events with what+where+when
  /scene/heatmap — per-category angular distribution (for visualization)

Example: after running for a day, /scene might show:
  {"speech": {"usual_angle": 15.0, "observations": 847},
   "music": {"usual_angle": 270.0, "observations": 312}}
If speech then arrives from 270° (where music usually is), it is reported as a spatial anomaly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex
2026-04-12 21:17:29 -05:00
parent 2a25db8498
commit 8f71d97af6
2 changed files with 346 additions and 1 deletions

View File

@@ -160,6 +160,7 @@ class ServiceState:
self.active_side: str = "left" # which mic array is currently active
self.doa: dict = {} # latest DoA from both arrays
self.spatial: Optional[dict] = None # triangulated position + gaze
self.last_anomaly: Optional[dict] = None # last spatial anomaly detected
state = ServiceState()
@@ -173,6 +174,9 @@ enrollment_buffer = None # list of frame bytes, set during enrollment
# Binaural recorder
binaural_recorder = None
# Spatial scene
spatial_scene = None
enrollment_name = None
# Audio stream
@@ -385,6 +389,18 @@ def sound_classifier_loop():
audio_f32 = result.pop("audio_float32", None)
state.audio_scene = result
# Spatial scene: log classified sound with its position
if spatial_scene and state.spatial and result.get("category"):
top = result.get("top_classes", [{}])[0] if result.get("top_classes") else {}
anomaly = spatial_scene.observe(
category=result["category"],
top_class=top.get("name", result["category"]),
score=top.get("score", 0),
spatial=state.spatial,
)
if anomaly:
state.last_anomaly = anomaly
# Speaker identification: run when speech detected
if speaker_recognizer and result["category"] == "speech" and audio_f32 is not None:
try:
@@ -466,7 +482,7 @@ app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂 (Dual XVF3800)"
@app.on_event("startup")
async def startup():
global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE, spatial_tracker, binaural_recorder
global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE, spatial_tracker, binaural_recorder, spatial_scene
state.running = True
@@ -542,6 +558,11 @@ async def startup():
logger.info("Spatial tracking started (%d Hz, %.0fmm baseline, pushing gaze to %s)",
DOA_POLL_HZ, array_sep, EYE_SERVICE_URL)
# --- Spatial scene mapping ---
from spatial_scene import SpatialScene
spatial_scene = SpatialScene()
spatial_scene.start()
# --- Binaural recording ---
if os.environ.get("BINAURAL_RECORD", "").lower() in ("1", "true", "yes"):
from binaural_recorder import BinauralRecorder
@@ -561,6 +582,8 @@ async def startup():
async def shutdown():
state.running = False
leds_off()
if spatial_scene:
spatial_scene.stop()
if binaural_recorder:
binaural_recorder.stop()
if dual_stream:
@@ -629,6 +652,35 @@ async def doa():
}
# --- Spatial scene ---
@app.get("/scene")
async def scene():
    """Learned spatial audio scene — where each sound type usually comes from.

    The response is a dict with two keys: ``scene`` carries whatever
    ``spatial_scene.get_scene_summary()`` reports, and ``last_anomaly``
    carries ``state.last_anomaly``. While no scene tracker is available the
    response is an empty scene with ``last_anomaly`` set to ``None``.
    """
    if spatial_scene:
        summary = spatial_scene.get_scene_summary()
        return {"scene": summary, "last_anomaly": state.last_anomaly}

    # No tracker: answer with an empty payload instead of failing the request.
    return {"scene": {}, "last_anomaly": None}
@app.get("/scene/events")
async def scene_events(seconds: int = 30, category: Optional[str] = None):
    """Recent sound events with spatial information.

    Args:
        seconds: Look-back window handed to the scene tracker (default 30).
        category: Optional category handed to the scene tracker unchanged;
            ``None`` when the query parameter is omitted.

    Returns:
        A dict of the form ``{"events": [...]}``. The list is the result of
        ``spatial_scene.get_recent_events(seconds, category)``, or empty
        while no scene tracker is available.
    """
    if not spatial_scene:
        # Tracker not available: report no events rather than erroring.
        return {"events": []}
    return {"events": spatial_scene.get_recent_events(seconds, category)}
@app.get("/scene/heatmap")
async def scene_heatmap():
    """Observation counts per angle bin per category — for visualization.

    Wraps ``spatial_scene.get_spatial_heatmap()`` under the ``heatmap`` key;
    an empty mapping is returned while no scene tracker is available.
    """
    heatmap = spatial_scene.get_spatial_heatmap() if spatial_scene else {}
    return {"heatmap": heatmap}
# --- Binaural recording ---
@app.get("/recording")