diff --git a/headmic.py b/headmic.py index 15eddc2..2127626 100644 --- a/headmic.py +++ b/headmic.py @@ -423,8 +423,10 @@ def doa_track_loop(): try: state.doa = xvf_manager.read_both_doa() - if spatial_tracker: - result = spatial_tracker.update(state.doa) + if spatial_tracker and dual_stream: + left_energy = dual_stream.left.get_energy() if dual_stream.left else 0.0 + right_energy = dual_stream.right.get_energy() if dual_stream.right else 0.0 + result = spatial_tracker.update(state.doa, left_energy, right_energy) if result: state.spatial = result gx, gy = result["gaze_x"], result["gaze_y"] diff --git a/spatial.py b/spatial.py index f2cf1b4..11a170e 100644 --- a/spatial.py +++ b/spatial.py @@ -26,6 +26,21 @@ SMOOTHING_ALPHA = 0.4 # exponential smoothing (0=sluggish, 1=instant) — IDLE_RETURN_SPEED = 0.03 # how fast gaze drifts to center when no VAD — gentle drift IDLE_TIMEOUT_S = 1.5 # seconds of no VAD before drifting to center +# Distance estimation (ILD-based) +# ILD = 20 * log10(louder_energy / quieter_energy) in dB +# Empirical mapping: ILD varies with angle and distance. +# At 175mm separation, a source at 45° off-center produces: +# ~0.5m: ILD ≈ 6-10 dB +# ~1.5m: ILD ≈ 3-5 dB +# ~3.0m: ILD ≈ 1-2 dB +# These are rough — calibrate on real hardware. +PROXIMITY_ZONES = [ + ("intimate", 0, 500), # < 0.5m — whispering distance + ("conversational", 500, 2000), # 0.5-2m — normal talking + ("across_room", 2000, 5000), # 2-5m — raised voice + ("far", 5000, 99999), # > 5m — shouting distance +] + class SpatialTracker: """Triangulates sound source from two DoA angles and produces smooth gaze.""" @@ -39,6 +54,8 @@ class SpatialTracker: self._smooth_y: float = 0.0 # mm, forward from skull self._smooth_gaze_x: float = float(GAZE_CENTER) self._smooth_gaze_y: float = float(GAZE_CENTER) + self._smooth_distance: float = GAZE_MAX_DISTANCE_MM + self._smooth_ild: float = 0.0 # dB # VAD tracking self._last_vad_time: float = 0.0 @@ -47,16 +64,18 @@ class SpatialTracker: # Last raw result for API self.last_position: Optional[dict] = None - def update(self, doa: dict) -> Optional[dict]: + def update(self, doa: dict, left_energy: float = 0.0, right_energy: float = 0.0) -> Optional[dict]: """ - Process DoA readings from both arrays. + Process DoA readings + audio energy from both arrays. Args: doa: {"left": {"angle": 0-359, "vad": bool}, "right": {"angle": 0-359, "vad": bool}} + left_energy: RMS energy from left mic stream (0.0-1.0) + right_energy: RMS energy from right mic stream (0.0-1.0) Returns: - {"x_mm": float, "y_mm": float, "distance_mm": float, - "gaze_x": int, "gaze_y": int, "vad": bool, "side": str} + {"x_mm", "y_mm", "distance_mm", "ild_db", "proximity", + "gaze_x", "gaze_y", "vad", "side"} or None if insufficient data. """ left = doa.get("left") @@ -79,10 +98,21 @@ class SpatialTracker: # Triangulate position pos = self._triangulate(left_angle, right_angle) + # Compute ILD (Interaural Level Difference) + ild_db = self._compute_ild(left_energy, right_energy) + if pos and any_vad: # Smooth the position self._smooth_x += SMOOTHING_ALPHA * (pos["x_mm"] - self._smooth_x) self._smooth_y += SMOOTHING_ALPHA * (pos["y_mm"] - self._smooth_y) + self._smooth_ild += SMOOTHING_ALPHA * (ild_db - self._smooth_ild) + + # Fuse triangulated distance with ILD + tri_dist = math.sqrt(self._smooth_x**2 + self._smooth_y**2) + ild_dist = self._ild_to_distance(self._smooth_ild) + # Weighted average: trust triangulation more (0.7) but let ILD correct it (0.3) + fused_dist = 0.7 * tri_dist + 0.3 * ild_dist + self._smooth_distance += SMOOTHING_ALPHA * (fused_dist - self._smooth_distance) elif not any_vad: return self._idle_drift() @@ -93,10 +123,15 @@ class SpatialTracker: self._smooth_gaze_x += SMOOTHING_ALPHA * (gaze_x - self._smooth_gaze_x) self._smooth_gaze_y += SMOOTHING_ALPHA * (gaze_y - self._smooth_gaze_y) + # Classify proximity zone + proximity = self._classify_proximity(self._smooth_distance) + result = { "x_mm": round(self._smooth_x, 1), "y_mm": round(self._smooth_y, 1), - "distance_mm": round(math.sqrt(self._smooth_x**2 + self._smooth_y**2), 1), + "distance_mm": round(self._smooth_distance, 1), + "ild_db": round(self._smooth_ild, 1), + "proximity": proximity, "gaze_x": int(round(self._smooth_gaze_x)), "gaze_y": int(round(self._smooth_gaze_y)), "vad": any_vad, @@ -120,7 +155,9 @@ class SpatialTracker: result = { "x_mm": round(self._smooth_x, 1), "y_mm": round(self._smooth_y, 1), - "distance_mm": round(math.sqrt(self._smooth_x**2 + self._smooth_y**2), 1), + "distance_mm": round(self._smooth_distance, 1), + "ild_db": round(self._smooth_ild, 1), + "proximity": self._classify_proximity(self._smooth_distance), "gaze_x": int(round(self._smooth_gaze_x)), "gaze_y": int(round(self._smooth_gaze_y)), "vad": False, @@ -129,6 +166,40 @@ class SpatialTracker: self.last_position = result return result + @staticmethod + def _compute_ild(left_energy: float, right_energy: float) -> float: + """Compute Interaural Level Difference in dB. + Positive = louder on left, negative = louder on right.""" + # Clamp to avoid log(0) + left_e = max(left_energy, 1e-10) + right_e = max(right_energy, 1e-10) + return 20.0 * math.log10(left_e / right_e) + + @staticmethod + def _ild_to_distance(ild_db: float) -> float: + """Estimate distance from ILD magnitude. + Higher ILD = closer source (head shadow effect is stronger up close). + This is a rough empirical mapping — should be calibrated per-installation.""" + ild_abs = abs(ild_db) + if ild_abs > 8.0: + return 300.0 # very close, ~30cm + elif ild_abs > 5.0: + return 700.0 # close, ~70cm + elif ild_abs > 3.0: + return 1500.0 # conversational, ~1.5m + elif ild_abs > 1.5: + return 2500.0 # across room, ~2.5m + else: + return 4000.0 # far or directly ahead (no ILD) + + @staticmethod + def _classify_proximity(distance_mm: float) -> str: + """Classify distance into a proximity zone.""" + for name, lo, hi in PROXIMITY_ZONES: + if lo <= distance_mm < hi: + return name + return "far" + def _triangulate(self, left_deg: float, right_deg: float) -> Optional[dict]: """ Triangulate sound source position from two DoA angles.