"""
Speaker Identification Module for HeadMic

Resemblyzer GE2E speaker encoder — 256-dim embeddings, cosine similarity matching.
Triggered when YAMNet detects speech.

Supports both enrolled speakers ("Alex") and anonymous tracking ("unknown_a7f3")
via online clustering of unrecognized embeddings.
"""

import hashlib
import logging
import sqlite3
import time
from contextlib import contextmanager
from pathlib import Path
from typing import Iterator, Optional, Union

import numpy as np

logger = logging.getLogger("speaker_id")
logger.setLevel(logging.INFO)

SIMILARITY_THRESHOLD = 0.75
ANON_SIMILARITY_THRESHOLD = 0.70  # slightly looser for clustering unknowns
ANON_MAX_TRACKED = 10  # max anonymous speakers to track
ANON_EXPIRY_S = 3600  # forget anonymous speakers after 1 hour of silence

# Minimum number of preprocessed samples needed before an embedding is
# attempted (1600 samples = 0.1 s at the 16 kHz rate passed to preprocess_wav).
_MIN_WAV_SAMPLES = 1600

_INSERT_VOICE_SQL = (
    "INSERT INTO voices (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)"
)


class SpeakerRecognizer:
    """Identify speakers by comparing voice embeddings.

    Enrolled speakers are persisted in a SQLite ``voices`` table and mirrored
    in an in-memory cache (``name -> list of embeddings``). Voices that do not
    match any enrolled speaker are tracked in memory only, under generated
    ``unknown_XXXX`` IDs, and can later be promoted to enrolled speakers.

    Similarity is a plain dot product, which equals cosine similarity only for
    unit-length vectors.
    NOTE(review): assumes the encoder returns L2-normalised embeddings —
    confirm against the Resemblyzer documentation.
    """

    def __init__(self, db_path: Union[str, Path] = "voices.db"):
        """Load the voice encoder, open/create the DB and warm the cache.

        Args:
            db_path: Location of the SQLite database file.
        """
        # Imported lazily so the module can be imported without resemblyzer.
        from resemblyzer import VoiceEncoder

        self._encoder = VoiceEncoder("cpu")
        logger.info("Resemblyzer voice encoder loaded")

        self._db_path = str(db_path)
        self._init_db()
        self._cache = self._load_embeddings()

        # Anonymous speaker tracking: short-lived clustering of unrecognized voices
        # Key: "unknown_XXXX"
        # Value: {"embedding": avg_emb, "last_seen": t, "first_seen": t, "count": N}
        self._anon_speakers: dict[str, dict] = {}

        logger.info(
            "Speaker DB ready: %d embeddings for %d speakers",
            sum(len(v) for v in self._cache.values()),
            len(self._cache),
        )

    @contextmanager
    def _transaction(self) -> Iterator[sqlite3.Connection]:
        """Yield a connection that is committed/rolled back AND closed.

        ``with sqlite3.connect(...)`` on its own only manages the transaction;
        it never closes the connection, so each call would leak a handle.
        """
        conn = sqlite3.connect(self._db_path)
        try:
            with conn:  # commit on success, roll back on exception
                yield conn
        finally:
            conn.close()

    def _init_db(self) -> None:
        """Create the ``voices`` table and its name index if they are missing."""
        with self._transaction() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS voices (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    enrolled_at REAL NOT NULL,
                    source TEXT
                )
            """)
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_voices_name ON voices(name)"
            )

    def _load_embeddings(self) -> dict:
        """Load all embeddings from DB into memory, grouped by name."""
        with self._transaction() as conn:
            rows = conn.execute("SELECT name, embedding FROM voices").fetchall()

        cache: dict = {}
        for name, blob in rows:
            # frombuffer returns a read-only view onto the blob; copy so the
            # cached array is an independent, writable buffer.
            emb = np.frombuffer(blob, dtype=np.float32).copy()
            cache.setdefault(name, []).append(emb)
        return cache

    def _store_embedding(self, name: str, embedding: np.ndarray, source: str) -> np.ndarray:
        """Persist one embedding under ``name`` and add it to the cache.

        The cache is only touched after the DB write succeeded, so a failed
        insert leaves memory and disk consistent. Returns the float32 array
        that was stored.
        """
        stored = np.asarray(embedding, dtype=np.float32)
        with self._transaction() as conn:
            conn.execute(_INSERT_VOICE_SQL, (name, stored.tobytes(), time.time(), source))
        self._cache.setdefault(name, []).append(stored)
        return stored

    def identify(self, audio_float32):
        """Identify speaker from float32 audio at 16kHz.

        Returns:
            (name, confidence) where name is either an enrolled name ("Alex")
            or an anonymous tracker ID ("unknown_a7f3"). Confidence is rounded
            to 3 decimals. Returns (None, 0.0) if the audio is too short to
            compute an embedding or if embedding computation fails (the
            failure is logged as a warning, not raised).
        """
        try:
            from resemblyzer import preprocess_wav

            wav = preprocess_wav(audio_float32, source_sr=16000)
            if len(wav) < _MIN_WAV_SAMPLES:  # too short
                return None, 0.0
            embedding = self._encoder.embed_utterance(wav)
        except Exception as e:  # best-effort boundary: never crash the caller
            logger.warning("Embedding computation failed: %s", e)
            return None, 0.0

        # First: check enrolled speakers
        best_name, best_score = self._match_enrolled(embedding)
        if best_score >= SIMILARITY_THRESHOLD:
            return best_name, round(float(best_score), 3)

        # Not enrolled — match or create anonymous speaker
        anon_name, anon_score = self._match_anonymous(embedding)
        return anon_name, round(float(anon_score), 3)

    def _match_enrolled(self, embedding: np.ndarray) -> tuple[Optional[str], float]:
        """Return the enrolled speaker with the highest similarity.

        A speaker's score is the best dot product over all of their stored
        embeddings. Returns (None, 0.0) when nothing scores above zero; the
        caller applies SIMILARITY_THRESHOLD.
        """
        best_name = None
        best_score = 0.0
        for name, embeddings in self._cache.items():
            # default=0.0 keeps an (unexpected) empty sample list from raising.
            top = max((float(np.dot(embedding, emb)) for emb in embeddings), default=0.0)
            if top > best_score:
                best_name, best_score = name, top
        return best_name, best_score

    def _match_anonymous(self, embedding: np.ndarray) -> tuple[str, float]:
        """Match embedding against tracked anonymous speakers, or create new one.

        Side effects: expires speakers silent for more than ANON_EXPIRY_S,
        updates the matched speaker's running-average embedding, and evicts
        the least recently seen speaker when ANON_MAX_TRACKED is reached.

        Returns:
            (anon_id, score). For a newly created speaker the score is a
            fixed 0.5.
        """
        now = time.time()

        # Expire old anonymous speakers
        expired = [
            anon_id
            for anon_id, info in self._anon_speakers.items()
            if now - info["last_seen"] > ANON_EXPIRY_S
        ]
        for anon_id in expired:
            logger.debug("Anonymous speaker %s expired", anon_id)
            del self._anon_speakers[anon_id]

        # Find best match among existing anonymous speakers
        best_id = None
        best_score = 0.0
        for anon_id, info in self._anon_speakers.items():
            score = float(np.dot(embedding, info["embedding"]))
            if score > best_score:
                best_id, best_score = anon_id, score

        if best_id is not None and best_score >= ANON_SIMILARITY_THRESHOLD:
            info = self._anon_speakers[best_id]
            count = info["count"]
            # Incremental mean: new_avg = old_avg + (new - old_avg) / (count + 1)
            updated = info["embedding"] + (embedding - info["embedding"]) / (count + 1)
            # Re-normalize so the dot product keeps behaving like cosine similarity
            norm = np.linalg.norm(updated)
            if norm > 0:
                updated = updated / norm
            info["embedding"] = updated
            info["count"] = count + 1
            info["last_seen"] = now
            return best_id, best_score

        # No match — create new anonymous speaker
        if len(self._anon_speakers) >= ANON_MAX_TRACKED:
            # Evict the least recently seen speaker
            oldest = min(
                self._anon_speakers,
                key=lambda k: self._anon_speakers[k]["last_seen"],
            )
            del self._anon_speakers[oldest]

        # IDs are only 4 hex chars, so two different voices can hash to the
        # same ID. Re-salt until the ID is free rather than overwriting another
        # tracked speaker (or shadowing an enrolled name in list_speakers()).
        salt = 0
        anon_id = self._make_anon_id(embedding)
        while anon_id in self._anon_speakers or anon_id in self._cache:
            salt += 1
            anon_id = self._make_anon_id(embedding, salt)

        self._anon_speakers[anon_id] = {
            "embedding": embedding.copy(),
            "last_seen": now,
            "first_seen": now,
            "count": 1,
        }
        logger.info("New anonymous speaker: %s", anon_id)
        return anon_id, 0.5  # moderate confidence for first sighting

    @staticmethod
    def _make_anon_id(embedding: np.ndarray, salt: int = 0) -> str:
        """Generate a short ``unknown_XXXX`` ID from an embedding.

        The embedding is quantized to 8 bits per dimension and hashed with
        SHA-256, so the ID is deterministic for a given quantized embedding.
        It is NOT similarity-preserving: two slightly different embeddings of
        the same voice generally produce unrelated IDs. Identity over time
        comes from the similarity matching in ``_match_anonymous``.

        Args:
            embedding: Vector with components expected in [-1, 1]; values
                outside that range are clipped.
            salt: Non-zero values are mixed into the hash to derive an
                alternative ID when the default one is already taken.
        """
        quantized = ((embedding + 1.0) * 127.5).clip(0, 255).astype(np.uint8)
        payload = quantized.tobytes()
        if salt:
            payload += salt.to_bytes(4, "big")
        digest = hashlib.sha256(payload).hexdigest()[:4]
        return f"unknown_{digest}"

    def enroll(self, name, audio_float32, source="api"):
        """Enroll a speaker from float32 audio at 16kHz.

        Args:
            name: Speaker name to store the sample under.
            audio_float32: Audio samples, passed to Resemblyzer's preprocess_wav.
            source: Free-form label recorded with the sample.

        Returns:
            The computed embedding (256-dim).

        Raises:
            ValueError: If the preprocessed audio is too short.
        """
        from resemblyzer import preprocess_wav

        wav = preprocess_wav(audio_float32, source_sr=16000)
        if len(wav) < _MIN_WAV_SAMPLES:
            raise ValueError("Audio too short for enrollment")

        embedding = self._store_embedding(
            name, self._encoder.embed_utterance(wav), source
        )
        logger.info(
            "Enrolled speaker '%s' (source=%s, total=%d samples)",
            name, source, len(self._cache[name]),
        )
        return embedding

    def list_speakers(self):
        """Return speaker names mapped to counts.

        Enrolled speakers map to their number of stored samples; active
        anonymous speakers map to their number of observations.
        """
        result = {name: len(embs) for name, embs in self._cache.items()}
        # Include active anonymous speakers
        result.update(
            (anon_id, info["count"]) for anon_id, info in self._anon_speakers.items()
        )
        return result

    def promote_anonymous(self, anon_id: str, name: str) -> bool:
        """Promote an anonymous speaker to an enrolled speaker.

        Saves their averaged embedding to the database under the given name.

        Returns:
            True on success, False if ``anon_id`` is not currently tracked.

        Raises:
            sqlite3.Error: If the database write fails. In that case the
                anonymous speaker stays tracked so the promotion can be retried.
        """
        info = self._anon_speakers.get(anon_id)
        if info is None:
            return False

        self._store_embedding(name, info["embedding"], "promoted")
        # Forget the anonymous entry only once the write has succeeded.
        self._anon_speakers.pop(anon_id, None)
        logger.info(
            "Promoted %s → '%s' (%d observations)", anon_id, name, info["count"]
        )
        return True

    def delete_speaker(self, name):
        """Remove all embeddings for a speaker.

        Returns:
            Number of cached samples that were removed (0 if the name was not
            in the cache).
        """
        with self._transaction() as conn:
            conn.execute("DELETE FROM voices WHERE name = ?", (name,))
        removed = self._cache.pop(name, None)
        if removed:
            logger.info("Deleted speaker '%s' (%d samples)", name, len(removed))
            return len(removed)
        return 0