From 05034acd27a2ddde4b435b40c0140f0632ff45df Mon Sep 17 00:00:00 2001
From: Alex <akazaev@proton.me>
Date: Sun, 12 Apr 2026 21:58:30 -0500
Subject: [PATCH] Add anonymous speaker tracking (online diarization)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Unrecognized speakers now get stable IDs like "unknown_a7f3" instead
of None. Uses online clustering of Resemblyzer embeddings:
- Matches against tracked anonymous speakers (cosine > 0.70)
- Updates running average embedding on re-identification
- Creates new ID from SHA-256 hash of quantized embedding
- Expires after 1 hour of silence, max 10 tracked simultaneously

New API: POST /speakers/promote?anon_id=unknown_a7f3&name=Alex
Promotes an anonymous speaker to enrolled using their averaged embedding.

Flow: unknown person speaks → "unknown_a7f3" → you ask "who's that?" →
promote to "Bob" → now recognized by name going forward.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 headmic.py    |  13 ++++++
 speaker_id.py | 108 ++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 114 insertions(+), 7 deletions(-)

diff --git a/headmic.py b/headmic.py
index f2b8dc1..f1c545a 100644
--- a/headmic.py
+++ b/headmic.py
@@ -868,6 +868,19 @@ async def list_speakers():
     return {"speakers": speaker_recognizer.list_speakers()}
 
 
+@app.post("/speakers/promote")
+async def promote_speaker(anon_id: str, name: str):
+    """Promote an anonymous speaker (unknown_XXXX) to an enrolled speaker.
+    Uses their accumulated embedding average — no new audio needed."""
+    if speaker_recognizer is None:
+        raise HTTPException(status_code=503, detail="Speaker recognition not available")
+    if not anon_id.startswith("unknown_"):
+        raise HTTPException(status_code=400, detail="anon_id must start with 'unknown_'")
+    if speaker_recognizer.promote_anonymous(anon_id, name):
+        return {"promoted": anon_id, "name": name, "speakers": speaker_recognizer.list_speakers()}
+    raise HTTPException(status_code=404, detail=f"Anonymous speaker '{anon_id}' not found")
+
+
 @app.delete("/speakers/{name}")
 async def delete_speaker(name: str):
     """Remove a speaker."""
diff --git a/speaker_id.py b/speaker_id.py
index 46d761c..fd898eb 100644
--- a/speaker_id.py
+++ b/speaker_id.py
@@ -2,8 +2,12 @@
 Speaker Identification Module for HeadMic
 Resemblyzer GE2E speaker encoder — 256-dim embeddings, cosine similarity matching.
 Triggered when YAMNet detects speech.
+
+Supports both enrolled speakers ("Alex") and anonymous tracking ("unknown_a7f3")
+via online clustering of unrecognized embeddings.
 """
 
+import hashlib
 import logging
 import sqlite3
 import time
@@ -15,6 +19,9 @@ logger = logging.getLogger("speaker_id")
 logger.setLevel(logging.INFO)
 
 SIMILARITY_THRESHOLD = 0.75
+ANON_SIMILARITY_THRESHOLD = 0.70  # slightly looser for clustering unknowns
+ANON_MAX_TRACKED = 10             # max anonymous speakers to track
+ANON_EXPIRY_S = 3600              # forget anonymous speakers after 1 hour of silence
 
 
 class SpeakerRecognizer:
@@ -27,6 +34,11 @@ class SpeakerRecognizer:
         self._db_path = str(db_path)
         self._init_db()
         self._cache = self._load_embeddings()
+
+        # Anonymous speaker tracking: short-lived clustering of unrecognized voices
+        # Key: "unknown_XXXX", Value: {"embedding": avg_emb, "last_seen": time, "count": N}
+        self._anon_speakers: dict[str, dict] = {}
+
         logger.info(
             "Speaker DB ready: %d embeddings for %d speakers",
             sum(len(v) for v in self._cache.values()),
@@ -62,11 +74,10 @@ class SpeakerRecognizer:
         """Identify speaker from float32 audio at 16kHz.
 
         Returns:
-            (name, confidence) or (None, 0.0) if no match above threshold.
+            (name, confidence) where name is either an enrolled name ("Alex")
+            or an anonymous tracker ID ("unknown_a7f3"). Returns (None, 0.0)
+            only if the audio is too short to compute an embedding.
         """
-        if not self._cache:
-            return None, 0.0
-
         try:
             from resemblyzer import preprocess_wav
             wav = preprocess_wav(audio_float32, source_sr=16000)
@@ -77,11 +88,11 @@ class SpeakerRecognizer:
             logger.warning("Embedding computation failed: %s", e)
             return None, 0.0
 
+        # First: check enrolled speakers
         best_name = None
         best_score = 0.0
 
         for name, embeddings in self._cache.items():
-            # Best score across all enrolled samples for this speaker
             scores = [np.dot(embedding, emb) for emb in embeddings]
             top = max(scores)
             if top > best_score:
@@ -90,7 +101,68 @@ class SpeakerRecognizer:
 
         if best_score >= SIMILARITY_THRESHOLD:
             return best_name, round(float(best_score), 3)
-        return None, 0.0
+
+        # Not enrolled — match or create anonymous speaker
+        anon_name, anon_score = self._match_anonymous(embedding)
+        return anon_name, round(float(anon_score), 3)
+
+    def _match_anonymous(self, embedding: np.ndarray) -> tuple[str, float]:
+        """Match embedding against tracked anonymous speakers, or create new one."""
+        now = time.time()
+
+        # Expire old anonymous speakers
+        expired = [k for k, v in self._anon_speakers.items()
+                   if now - v["last_seen"] > ANON_EXPIRY_S]
+        for k in expired:
+            logger.debug("Anonymous speaker %s expired", k)
+            del self._anon_speakers[k]
+
+        # Find best match among existing anonymous speakers
+        best_id = None
+        best_score = 0.0
+        for anon_id, info in self._anon_speakers.items():
+            score = float(np.dot(embedding, info["embedding"]))
+            if score > best_score:
+                best_score = score
+                best_id = anon_id
+
+        if best_score >= ANON_SIMILARITY_THRESHOLD and best_id:
+            # Update the running average embedding
+            info = self._anon_speakers[best_id]
+            count = info["count"]
+            # Incremental mean: new_avg = old_avg + (new - old_avg) / (count + 1)
+            info["embedding"] = info["embedding"] + (embedding - info["embedding"]) / (count + 1)
+            # Re-normalize (embeddings should be unit vectors)
+            norm = np.linalg.norm(info["embedding"])
+            if norm > 0:
+                info["embedding"] /= norm
+            info["count"] = count + 1
+            info["last_seen"] = now
+            return best_id, best_score
+
+        # No match — create new anonymous speaker
+        if len(self._anon_speakers) >= ANON_MAX_TRACKED:
+            # Evict the oldest
+            oldest = min(self._anon_speakers, key=lambda k: self._anon_speakers[k]["last_seen"])
+            del self._anon_speakers[oldest]
+
+        anon_id = self._make_anon_id(embedding)
+        self._anon_speakers[anon_id] = {
+            "embedding": embedding.copy(),
+            "last_seen": now,
+            "first_seen": now,
+            "count": 1,
+        }
+        logger.info("New anonymous speaker: %s", anon_id)
+        return anon_id, 0.5  # moderate confidence for first sighting
+
+    @staticmethod
+    def _make_anon_id(embedding: np.ndarray) -> str:
+        """Generate a stable short ID from an embedding. Same voice → same ID."""
+        # Quantize embedding to 8-bit and hash — similar voices get similar hashes
+        quantized = ((embedding + 1.0) * 127.5).clip(0, 255).astype(np.uint8)
+        h = hashlib.sha256(quantized.tobytes()).hexdigest()[:4]
+        return f"unknown_{h}"
 
     def enroll(self, name, audio_float32, source="api"):
         """Enroll a speaker from float32 audio at 16kHz.
@@ -120,7 +192,29 @@ class SpeakerRecognizer:
 
     def list_speakers(self):
         """Return enrolled speaker names with sample counts."""
-        return {name: len(embs) for name, embs in self._cache.items()}
+        result = {name: len(embs) for name, embs in self._cache.items()}
+        # Include active anonymous speakers
+        for anon_id, info in self._anon_speakers.items():
+            result[anon_id] = info["count"]
+        return result
+
+    def promote_anonymous(self, anon_id: str, name: str) -> bool:
+        """Promote an anonymous speaker to an enrolled speaker.
+        Saves their averaged embedding to the database under the given name."""
+        if anon_id not in self._anon_speakers:
+            return False
+        info = self._anon_speakers.pop(anon_id)
+        embedding = info["embedding"]
+        blob = embedding.astype(np.float32).tobytes()
+        now = time.time()
+        with sqlite3.connect(self._db_path) as conn:
+            conn.execute(
+                "INSERT INTO voices (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)",
+                (name, blob, now, "promoted"),
+            )
+        self._cache.setdefault(name, []).append(embedding)
+        logger.info("Promoted %s → '%s' (%d observations)", anon_id, name, info["count"])
+        return True
 
     def delete_speaker(self, name):
         """Remove all embeddings for a speaker."""