Add spatial audio scene mapping + sound event localization (#6 + #8)

spatial_scene.py: Builds a persistent map of where each sound category usually comes from (30° angle bins, circular mean). Detects an anomaly when a sound arrives from an unusual direction (90°+ deviation from the usual direction). The scene map persists to ~/.vixy/scene_map.json across restarts.

headmic.py: Feeds each classified sound, together with its spatial position, into the scene tracker.

New endpoints:
- /scene — learned scene summary + last anomaly
- /scene/events — recent events with what + where + when
- /scene/heatmap — per-category angular distribution (for visualization)

Example: after running for a day, /scene might show:
{"speech": {"usual_angle": 15.0, "observations": 847}, "music": {"usual_angle": 270.0, "observations": 312}}
If speech then comes from 270° (where music usually is), that is reported as a spatial anomaly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-12 21:17:29 -05:00
parent 2a25db8498
commit 8f71d97af6
2 changed files with 346 additions and 1 deletions
--- a/headmic.py
+++ b/headmic.py
@@ -160,6 +160,7 @@ class ServiceState:
        self.active_side: str = "left"    # which mic array is currently active
        self.doa: dict = {}               # latest DoA from both arrays
        self.spatial: Optional[dict] = None  # triangulated position + gaze
+        self.last_anomaly: Optional[dict] = None  # last spatial anomaly detected

 state = ServiceState()

@@ -173,6 +174,9 @@ enrollment_buffer = None  # list of frame bytes, set during enrollment

 # Binaural recorder
 binaural_recorder = None
+
+# Spatial scene
+spatial_scene = None
 enrollment_name = None

 # Audio stream
@@ -385,6 +389,18 @@ def sound_classifier_loop():
            audio_f32 = result.pop("audio_float32", None)
            state.audio_scene = result

+            # Spatial scene: log classified sound with its position
+            if spatial_scene and state.spatial and result.get("category"):
+                top = result.get("top_classes", [{}])[0] if result.get("top_classes") else {}
+                anomaly = spatial_scene.observe(
+                    category=result["category"],
+                    top_class=top.get("name", result["category"]),
+                    score=top.get("score", 0),
+                    spatial=state.spatial,
+                )
+                if anomaly:
+                    state.last_anomaly = anomaly
+
            # Speaker identification: run when speech detected
            if speaker_recognizer and result["category"] == "speech" and audio_f32 is not None:
                try:
@@ -466,7 +482,7 @@ app = FastAPI(title="HeadMic", description="Vixy's Ears 🦊👂 (Dual XVF3800)"

@app.on_event("startup")
 async def startup():
-    global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE, spatial_tracker, binaural_recorder
+    global sound_classifier, sound_ring_buffer, speaker_recognizer, dual_stream, LEDS_AVAILABLE, spatial_tracker, binaural_recorder, spatial_scene

    state.running = True

@@ -542,6 +558,11 @@ async def startup():
        logger.info("Spatial tracking started (%d Hz, %.0fmm baseline, pushing gaze to %s)",
                     DOA_POLL_HZ, array_sep, EYE_SERVICE_URL)

+    # --- Spatial scene mapping ---
+    from spatial_scene import SpatialScene
+    spatial_scene = SpatialScene()
+    spatial_scene.start()
+
    # --- Binaural recording ---
    if os.environ.get("BINAURAL_RECORD", "").lower() in ("1", "true", "yes"):
        from binaural_recorder import BinauralRecorder
@@ -561,6 +582,8 @@ async def startup():
 async def shutdown():
    state.running = False
    leds_off()
+    if spatial_scene:
+        spatial_scene.stop()
    if binaural_recorder:
        binaural_recorder.stop()
    if dual_stream:
@@ -629,6 +652,35 @@ async def doa():
    }


+# --- Spatial scene ---
+
+@app.get("/scene")
+async def scene():
+    """Learned spatial audio scene — where each sound type usually comes from."""
+    if not spatial_scene:
+        return {"scene": {}, "last_anomaly": None}
+    return {
+        "scene": spatial_scene.get_scene_summary(),
+        "last_anomaly": state.last_anomaly,
+    }
+
+
+@app.get("/scene/events")
+async def scene_events(seconds: int = 30, category: str = None):
+    """Recent sound events with spatial information."""
+    if not spatial_scene:
+        return {"events": []}
+    return {"events": spatial_scene.get_recent_events(seconds, category)}
+
+
+@app.get("/scene/heatmap")
+async def scene_heatmap():
+    """Observation counts per angle bin per category — for visualization."""
+    if not spatial_scene:
+        return {"heatmap": {}}
+    return {"heatmap": spatial_scene.get_spatial_heatmap()}
+
+
 # --- Binaural recording ---

@app.get("/recording")