Add ILD-based distance estimation + proximity zones
Computes the Interaural Level Difference (ILD, in dB) from left/right ear energy and fuses the ILD-derived distance with the triangulated distance (70% triangulation / 30% ILD) for a more robust estimate. Classifies the result into proximity zones: intimate (<0.5m), conversational (0.5-2m), across_room (2-5m), far (>5m). The ILD→distance mapping is empirical and should be calibrated per install. The gaze vertical component now responds to proximity (closer = eyes down). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -423,8 +423,10 @@ def doa_track_loop():
|
|||||||
try:
|
try:
|
||||||
state.doa = xvf_manager.read_both_doa()
|
state.doa = xvf_manager.read_both_doa()
|
||||||
|
|
||||||
if spatial_tracker:
|
if spatial_tracker and dual_stream:
|
||||||
result = spatial_tracker.update(state.doa)
|
left_energy = dual_stream.left.get_energy() if dual_stream.left else 0.0
|
||||||
|
right_energy = dual_stream.right.get_energy() if dual_stream.right else 0.0
|
||||||
|
result = spatial_tracker.update(state.doa, left_energy, right_energy)
|
||||||
if result:
|
if result:
|
||||||
state.spatial = result
|
state.spatial = result
|
||||||
gx, gy = result["gaze_x"], result["gaze_y"]
|
gx, gy = result["gaze_x"], result["gaze_y"]
|
||||||
|
|||||||
83
spatial.py
83
spatial.py
@@ -26,6 +26,21 @@ SMOOTHING_ALPHA = 0.4 # exponential smoothing (0=sluggish, 1=instant) —
|
|||||||
IDLE_RETURN_SPEED = 0.03 # how fast gaze drifts to center when no VAD — gentle drift
|
IDLE_RETURN_SPEED = 0.03 # how fast gaze drifts to center when no VAD — gentle drift
|
||||||
IDLE_TIMEOUT_S = 1.5 # seconds of no VAD before drifting to center
|
IDLE_TIMEOUT_S = 1.5 # seconds of no VAD before drifting to center
|
||||||
|
|
||||||
|
# Distance estimation (ILD-based)
|
||||||
|
# ILD = 20 * log10(left_energy / right_energy) in dB — positive = louder on left; only the magnitude is used for distance
|
||||||
|
# Empirical mapping: ILD varies with angle and distance.
|
||||||
|
# At 175mm separation, a source at 45° off-center produces:
|
||||||
|
# ~0.5m: ILD ≈ 6-10 dB
|
||||||
|
# ~1.5m: ILD ≈ 3-5 dB
|
||||||
|
# ~3.0m: ILD ≈ 1-2 dB
|
||||||
|
# These are rough — calibrate on real hardware.
|
||||||
|
PROXIMITY_ZONES = [
|
||||||
|
("intimate", 0, 500), # < 0.5m — whispering distance
|
||||||
|
("conversational", 500, 2000), # 0.5-2m — normal talking
|
||||||
|
("across_room", 2000, 5000), # 2-5m — raised voice
|
||||||
|
("far", 5000, 99999), # > 5m — shouting distance
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class SpatialTracker:
|
class SpatialTracker:
|
||||||
"""Triangulates sound source from two DoA angles and produces smooth gaze."""
|
"""Triangulates sound source from two DoA angles and produces smooth gaze."""
|
||||||
@@ -39,6 +54,8 @@ class SpatialTracker:
|
|||||||
self._smooth_y: float = 0.0 # mm, forward from skull
|
self._smooth_y: float = 0.0 # mm, forward from skull
|
||||||
self._smooth_gaze_x: float = float(GAZE_CENTER)
|
self._smooth_gaze_x: float = float(GAZE_CENTER)
|
||||||
self._smooth_gaze_y: float = float(GAZE_CENTER)
|
self._smooth_gaze_y: float = float(GAZE_CENTER)
|
||||||
|
self._smooth_distance: float = GAZE_MAX_DISTANCE_MM
|
||||||
|
self._smooth_ild: float = 0.0 # dB
|
||||||
|
|
||||||
# VAD tracking
|
# VAD tracking
|
||||||
self._last_vad_time: float = 0.0
|
self._last_vad_time: float = 0.0
|
||||||
@@ -47,16 +64,18 @@ class SpatialTracker:
|
|||||||
# Last raw result for API
|
# Last raw result for API
|
||||||
self.last_position: Optional[dict] = None
|
self.last_position: Optional[dict] = None
|
||||||
|
|
||||||
def update(self, doa: dict) -> Optional[dict]:
|
def update(self, doa: dict, left_energy: float = 0.0, right_energy: float = 0.0) -> Optional[dict]:
|
||||||
"""
|
"""
|
||||||
Process DoA readings from both arrays.
|
Process DoA readings + audio energy from both arrays.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
doa: {"left": {"angle": 0-359, "vad": bool}, "right": {"angle": 0-359, "vad": bool}}
|
doa: {"left": {"angle": 0-359, "vad": bool}, "right": {"angle": 0-359, "vad": bool}}
|
||||||
|
left_energy: RMS energy from left mic stream (0.0-1.0)
|
||||||
|
right_energy: RMS energy from right mic stream (0.0-1.0)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
{"x_mm": float, "y_mm": float, "distance_mm": float,
|
{"x_mm", "y_mm", "distance_mm", "ild_db", "proximity",
|
||||||
"gaze_x": int, "gaze_y": int, "vad": bool, "side": str}
|
"gaze_x", "gaze_y", "vad", "side"}
|
||||||
or None if insufficient data.
|
or None if insufficient data.
|
||||||
"""
|
"""
|
||||||
left = doa.get("left")
|
left = doa.get("left")
|
||||||
@@ -79,10 +98,21 @@ class SpatialTracker:
|
|||||||
# Triangulate position
|
# Triangulate position
|
||||||
pos = self._triangulate(left_angle, right_angle)
|
pos = self._triangulate(left_angle, right_angle)
|
||||||
|
|
||||||
|
# Compute ILD (Interaural Level Difference)
|
||||||
|
ild_db = self._compute_ild(left_energy, right_energy)
|
||||||
|
|
||||||
if pos and any_vad:
|
if pos and any_vad:
|
||||||
# Smooth the position
|
# Smooth the position
|
||||||
self._smooth_x += SMOOTHING_ALPHA * (pos["x_mm"] - self._smooth_x)
|
self._smooth_x += SMOOTHING_ALPHA * (pos["x_mm"] - self._smooth_x)
|
||||||
self._smooth_y += SMOOTHING_ALPHA * (pos["y_mm"] - self._smooth_y)
|
self._smooth_y += SMOOTHING_ALPHA * (pos["y_mm"] - self._smooth_y)
|
||||||
|
self._smooth_ild += SMOOTHING_ALPHA * (ild_db - self._smooth_ild)
|
||||||
|
|
||||||
|
# Fuse triangulated distance with ILD
|
||||||
|
tri_dist = math.sqrt(self._smooth_x**2 + self._smooth_y**2)
|
||||||
|
ild_dist = self._ild_to_distance(self._smooth_ild)
|
||||||
|
# Weighted average: trust triangulation more (0.7) but let ILD correct it (0.3)
|
||||||
|
fused_dist = 0.7 * tri_dist + 0.3 * ild_dist
|
||||||
|
self._smooth_distance += SMOOTHING_ALPHA * (fused_dist - self._smooth_distance)
|
||||||
elif not any_vad:
|
elif not any_vad:
|
||||||
return self._idle_drift()
|
return self._idle_drift()
|
||||||
|
|
||||||
@@ -93,10 +123,15 @@ class SpatialTracker:
|
|||||||
self._smooth_gaze_x += SMOOTHING_ALPHA * (gaze_x - self._smooth_gaze_x)
|
self._smooth_gaze_x += SMOOTHING_ALPHA * (gaze_x - self._smooth_gaze_x)
|
||||||
self._smooth_gaze_y += SMOOTHING_ALPHA * (gaze_y - self._smooth_gaze_y)
|
self._smooth_gaze_y += SMOOTHING_ALPHA * (gaze_y - self._smooth_gaze_y)
|
||||||
|
|
||||||
|
# Classify proximity zone
|
||||||
|
proximity = self._classify_proximity(self._smooth_distance)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
"x_mm": round(self._smooth_x, 1),
|
"x_mm": round(self._smooth_x, 1),
|
||||||
"y_mm": round(self._smooth_y, 1),
|
"y_mm": round(self._smooth_y, 1),
|
||||||
"distance_mm": round(math.sqrt(self._smooth_x**2 + self._smooth_y**2), 1),
|
"distance_mm": round(self._smooth_distance, 1),
|
||||||
|
"ild_db": round(self._smooth_ild, 1),
|
||||||
|
"proximity": proximity,
|
||||||
"gaze_x": int(round(self._smooth_gaze_x)),
|
"gaze_x": int(round(self._smooth_gaze_x)),
|
||||||
"gaze_y": int(round(self._smooth_gaze_y)),
|
"gaze_y": int(round(self._smooth_gaze_y)),
|
||||||
"vad": any_vad,
|
"vad": any_vad,
|
||||||
@@ -120,7 +155,9 @@ class SpatialTracker:
|
|||||||
result = {
|
result = {
|
||||||
"x_mm": round(self._smooth_x, 1),
|
"x_mm": round(self._smooth_x, 1),
|
||||||
"y_mm": round(self._smooth_y, 1),
|
"y_mm": round(self._smooth_y, 1),
|
||||||
"distance_mm": round(math.sqrt(self._smooth_x**2 + self._smooth_y**2), 1),
|
"distance_mm": round(self._smooth_distance, 1),
|
||||||
|
"ild_db": round(self._smooth_ild, 1),
|
||||||
|
"proximity": self._classify_proximity(self._smooth_distance),
|
||||||
"gaze_x": int(round(self._smooth_gaze_x)),
|
"gaze_x": int(round(self._smooth_gaze_x)),
|
||||||
"gaze_y": int(round(self._smooth_gaze_y)),
|
"gaze_y": int(round(self._smooth_gaze_y)),
|
||||||
"vad": False,
|
"vad": False,
|
||||||
@@ -129,6 +166,40 @@ class SpatialTracker:
|
|||||||
self.last_position = result
|
self.last_position = result
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@staticmethod
def _compute_ild(left_energy: float, right_energy: float) -> float:
    """Compute the Interaural Level Difference (ILD) in dB.

    The result is ``20 * log10(left_energy / right_energy)``: positive
    means louder on the left, negative means louder on the right.

    Args:
        left_energy: Energy reading for the left channel.
        right_energy: Energy reading for the right channel.

    Returns:
        The signed ILD in dB, or 0.0 when either channel has no usable
        energy (below the 1e-10 floor, negative, or NaN).
    """
    floor = 1e-10  # smallest energy treated as a real signal

    # A channel at or below the floor carries no level information (for
    # example a stream that is absent and reported as 0.0). Taking the log
    # of a clamped value would yield an ILD of roughly +/-200 dB, which the
    # distance mapping would read as "very close". Report "no difference"
    # instead. The negated comparison also rejects NaN.
    if not (left_energy >= floor and right_energy >= floor):
        return 0.0

    return 20.0 * math.log10(left_energy / right_energy)
|
||||||
|
|
||||||
|
@staticmethod
def _ild_to_distance(
    ild_db: float,
    thresholds: tuple = (
        (8.0, 300.0),    # very close, ~30cm
        (5.0, 700.0),    # close, ~70cm
        (3.0, 1500.0),   # conversational, ~1.5m
        (1.5, 2500.0),   # across room, ~2.5m
    ),
    default_mm: float = 4000.0,
) -> float:
    """Estimate distance in mm from the magnitude of an ILD reading.

    A higher ILD is treated as a closer source. The default table is a
    rough empirical mapping and should be calibrated per installation;
    pass ``thresholds`` / ``default_mm`` to override it.

    Args:
        ild_db: Interaural Level Difference in dB; the sign is ignored.
        thresholds: ``(min_ild_db, distance_mm)`` pairs ordered from the
            largest ``min_ild_db`` to the smallest. The first pair whose
            ``min_ild_db`` is strictly exceeded by ``abs(ild_db)`` wins.
        default_mm: Distance returned when no threshold is exceeded
            (far, or directly ahead with no ILD).

    Returns:
        The estimated distance in millimetres.
    """
    ild_abs = abs(ild_db)
    for min_ild_db, distance_mm in thresholds:
        # Strict comparison: a reading exactly on a boundary falls into
        # the next (farther) step.
        if ild_abs > min_ild_db:
            return distance_mm
    return default_mm
|
||||||
|
|
||||||
|
@staticmethod
def _classify_proximity(distance_mm: float) -> str:
    """Return the name of the proximity zone containing ``distance_mm``.

    Each entry of ``PROXIMITY_ZONES`` is ``(name, lower, upper)`` and
    matches when ``lower <= distance_mm < upper``. A distance that no
    zone covers is reported as "far".
    """
    matching_zones = (
        zone_name
        for zone_name, lower_mm, upper_mm in PROXIMITY_ZONES
        if lower_mm <= distance_mm < upper_mm
    )
    # Zones are checked in table order; the first match wins.
    return next(matching_zones, "far")
|
||||||
|
|
||||||
def _triangulate(self, left_deg: float, right_deg: float) -> Optional[dict]:
|
def _triangulate(self, left_deg: float, right_deg: float) -> Optional[dict]:
|
||||||
"""
|
"""
|
||||||
Triangulate sound source position from two DoA angles.
|
Triangulate sound source position from two DoA angles.
|
||||||
|
|||||||
Reference in New Issue
Block a user