Add speaker identification with Resemblyzer

Adds voice-based speaker ID triggered by YAMNet speech detection.
New speaker_id.py module with SQLite-backed voice enrollment and
cosine similarity matching. Endpoints: POST /speakers/enroll,
POST /speakers/enroll-from-mic, GET /speakers, DELETE /speakers/{name}.
Orange LED animation during enrollment.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alex
2026-02-01 21:21:02 -06:00
parent 0607be3db5
commit 1cb3bd6833
4 changed files with 281 additions and 4 deletions

133
speaker_id.py Normal file
View File

@@ -0,0 +1,133 @@
"""
Speaker Identification Module for HeadMic
Resemblyzer GE2E speaker encoder — 256-dim embeddings, cosine similarity matching.
Triggered when YAMNet detects speech.
"""
import logging
import sqlite3
import time
from contextlib import closing
from pathlib import Path

import numpy as np
# Named module logger; its level is set to INFO here at import time.
logger = logging.getLogger("speaker_id")
logger.setLevel(logging.INFO)
# Minimum similarity score (dot product between the probe embedding and an
# enrolled embedding) that SpeakerRecognizer.identify requires to report a match.
SIMILARITY_THRESHOLD = 0.75
class SpeakerRecognizer:
    """Voice-based speaker identification backed by a SQLite store.

    Embeddings are computed with Resemblyzer's ``VoiceEncoder``, persisted in
    the ``voices`` table, and mirrored in an in-memory cache of the form
    ``{name: [embedding, ...]}`` that is used for matching.
    """

    # Shortest preprocessed clip (in samples) accepted for embedding.
    # 1600 samples is 0.1 s at the 16 kHz rate handed to ``preprocess_wav``.
    _MIN_SAMPLES = 1600

    def __init__(self, db_path="voices.db"):
        """Load the voice encoder and open (or create) the speaker database.

        Args:
            db_path: Path of the SQLite file holding enrolled embeddings.
        """
        # Imported lazily so that importing this module does not require
        # Resemblyzer to be loaded up front.
        from resemblyzer import VoiceEncoder

        self._encoder = VoiceEncoder("cpu")
        logger.info("Resemblyzer voice encoder loaded")

        self._db_path = str(db_path)
        self._init_db()
        self._cache = self._load_embeddings()
        logger.info(
            "Speaker DB ready: %d embeddings for %d speakers",
            sum(len(v) for v in self._cache.values()),
            len(self._cache),
        )

    def _connect(self):
        """Return a context manager yielding a connection that is closed on exit.

        ``sqlite3.Connection`` used directly as a context manager only commits
        or rolls back the transaction; it never closes the connection. Wrapping
        it in ``closing`` releases the file handle deterministically.
        """
        return closing(sqlite3.connect(self._db_path))

    def _init_db(self):
        """Create the ``voices`` table and its name index if they are missing."""
        # ``closing`` handles close(); the inner ``conn`` handles commit/rollback.
        with self._connect() as conn, conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS voices (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    embedding BLOB NOT NULL,
                    enrolled_at REAL NOT NULL,
                    source TEXT
                )
            """)
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_voices_name ON voices(name)"
            )

    def _load_embeddings(self):
        """Load all embeddings from DB into memory, grouped by name.

        Returns:
            Dict mapping each speaker name to a list of float32 arrays.
        """
        with self._connect() as conn:
            rows = conn.execute("SELECT name, embedding FROM voices").fetchall()

        cache = {}
        for name, blob in rows:
            # frombuffer yields a read-only view over the blob; copy so the
            # cached array owns its memory.
            emb = np.frombuffer(blob, dtype=np.float32).copy()
            cache.setdefault(name, []).append(emb)
        return cache

    def _embed(self, audio_float32):
        """Compute the utterance embedding for float32 audio at 16kHz.

        Returns:
            The embedding array, or ``None`` when the preprocessed clip is
            shorter than ``_MIN_SAMPLES``.
        """
        from resemblyzer import preprocess_wav

        wav = preprocess_wav(audio_float32, source_sr=16000)
        if len(wav) < self._MIN_SAMPLES:
            return None
        return self._encoder.embed_utterance(wav)

    def _best_match(self, embedding, threshold):
        """Score ``embedding`` against every cached speaker.

        A speaker's score is the best dot product across all of that
        speaker's enrolled samples.
        NOTE(review): treating the dot product as cosine similarity assumes
        the encoder emits unit-length embeddings — confirm.

        Returns:
            ``(name, score)`` with the score rounded to 3 decimals, or
            ``(None, 0.0)`` when no speaker reaches ``threshold``.
        """
        best_name = None
        best_score = 0.0
        for name, embeddings in self._cache.items():
            if not embeddings:
                continue  # max() would raise on an empty sample list
            top = max(np.dot(embedding, emb) for emb in embeddings)
            # Strict '>' keeps the earliest-enrolled speaker on ties.
            if top > best_score:
                best_score = top
                best_name = name

        if best_name is not None and best_score >= threshold:
            return best_name, round(float(best_score), 3)
        return None, 0.0

    def identify(self, audio_float32, threshold=None):
        """Identify speaker from float32 audio at 16kHz.

        Args:
            audio_float32: Audio samples to identify.
            threshold: Minimum score required for a match; defaults to the
                module-level ``SIMILARITY_THRESHOLD`` when ``None``.

        Returns:
            (name, confidence) or (None, 0.0) if no match above threshold.
        """
        if not self._cache:
            return None, 0.0
        if threshold is None:
            threshold = SIMILARITY_THRESHOLD

        try:
            embedding = self._embed(audio_float32)
        except Exception as e:
            # Deliberately best-effort: a failed embedding is reported as
            # "no match" instead of propagating to the caller.
            logger.warning("Embedding computation failed: %s", e)
            return None, 0.0
        if embedding is None:  # too short
            return None, 0.0

        return self._best_match(embedding, threshold)

    def enroll(self, name, audio_float32, source="api"):
        """Enroll a speaker from float32 audio at 16kHz.

        Args:
            name: Speaker name the sample is stored under.
            audio_float32: Audio samples to enroll.
            source: Free-form label recorded with the sample.

        Returns:
            The computed embedding (256-dim).

        Raises:
            ValueError: If the preprocessed audio is too short.
        """
        embedding = self._embed(audio_float32)
        if embedding is None:
            raise ValueError("Audio too short for enrollment")

        # Normalise the dtype once so the cached array matches what
        # _load_embeddings would read back from the stored blob.
        embedding = np.asarray(embedding, dtype=np.float32)
        blob = embedding.tobytes()
        now = time.time()
        with self._connect() as conn, conn:
            conn.execute(
                "INSERT INTO voices (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)",
                (name, blob, now, source),
            )

        # Update the cache only after the insert has been committed.
        self._cache.setdefault(name, []).append(embedding)
        logger.info(
            "Enrolled speaker '%s' (source=%s, total=%d samples)",
            name,
            source,
            len(self._cache[name]),
        )
        return embedding

    def list_speakers(self):
        """Return enrolled speaker names with sample counts."""
        return {name: len(embs) for name, embs in self._cache.items()}

    def delete_speaker(self, name):
        """Remove all embeddings for a speaker.

        Returns:
            Number of cached samples removed (0 if the speaker was unknown).
        """
        with self._connect() as conn, conn:
            conn.execute("DELETE FROM voices WHERE name = ?", (name,))

        removed = self._cache.pop(name, None)
        if removed:
            logger.info("Deleted speaker '%s' (%d samples)", name, len(removed))
            return len(removed)
        return 0