""" Speaker Identification Module for HeadMic Resemblyzer GE2E speaker encoder — 256-dim embeddings, cosine similarity matching. Triggered when YAMNet detects speech. """ import logging import sqlite3 import time from pathlib import Path import numpy as np logger = logging.getLogger("speaker_id") logger.setLevel(logging.INFO) SIMILARITY_THRESHOLD = 0.75 class SpeakerRecognizer: def __init__(self, db_path="voices.db"): from resemblyzer import VoiceEncoder self._encoder = VoiceEncoder("cpu") logger.info("Resemblyzer voice encoder loaded") self._db_path = str(db_path) self._init_db() self._cache = self._load_embeddings() logger.info( "Speaker DB ready: %d embeddings for %d speakers", sum(len(v) for v in self._cache.values()), len(self._cache), ) def _init_db(self): with sqlite3.connect(self._db_path) as conn: conn.execute(""" CREATE TABLE IF NOT EXISTS voices ( id INTEGER PRIMARY KEY, name TEXT NOT NULL, embedding BLOB NOT NULL, enrolled_at REAL NOT NULL, source TEXT ) """) conn.execute( "CREATE INDEX IF NOT EXISTS idx_voices_name ON voices(name)" ) def _load_embeddings(self): """Load all embeddings from DB into memory, grouped by name.""" cache = {} with sqlite3.connect(self._db_path) as conn: rows = conn.execute("SELECT name, embedding FROM voices").fetchall() for name, blob in rows: emb = np.frombuffer(blob, dtype=np.float32).copy() cache.setdefault(name, []).append(emb) return cache def identify(self, audio_float32): """Identify speaker from float32 audio at 16kHz. Returns: (name, confidence) or (None, 0.0) if no match above threshold. """ if not self._cache: return None, 0.0 try: from resemblyzer import preprocess_wav wav = preprocess_wav(audio_float32, source_sr=16000) if len(wav) < 1600: # too short return None, 0.0 embedding = self._encoder.embed_utterance(wav) except Exception as e: logger.warning("Embedding computation failed: %s", e) return None, 0.0 best_name = None best_score = 0.0 for name, embeddings in self._cache.items(): # Best score across all enrolled samples for this speaker scores = [np.dot(embedding, emb) for emb in embeddings] top = max(scores) if top > best_score: best_score = top best_name = name if best_score >= SIMILARITY_THRESHOLD: return best_name, round(float(best_score), 3) return None, 0.0 def enroll(self, name, audio_float32, source="api"): """Enroll a speaker from float32 audio at 16kHz. Returns: The computed embedding (256-dim). """ from resemblyzer import preprocess_wav wav = preprocess_wav(audio_float32, source_sr=16000) if len(wav) < 1600: raise ValueError("Audio too short for enrollment") embedding = self._encoder.embed_utterance(wav) blob = embedding.astype(np.float32).tobytes() now = time.time() with sqlite3.connect(self._db_path) as conn: conn.execute( "INSERT INTO voices (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)", (name, blob, now, source), ) self._cache.setdefault(name, []).append(embedding) logger.info("Enrolled speaker '%s' (source=%s, total=%d samples)", name, source, len(self._cache[name])) return embedding def list_speakers(self): """Return enrolled speaker names with sample counts.""" return {name: len(embs) for name, embs in self._cache.items()} def delete_speaker(self, name): """Remove all embeddings for a speaker.""" with sqlite3.connect(self._db_path) as conn: conn.execute("DELETE FROM voices WHERE name = ?", (name,)) removed = self._cache.pop(name, None) if removed: logger.info("Deleted speaker '%s' (%d samples)", name, len(removed)) return len(removed) return 0