diff --git a/docs/plans/2026-02-01-facial-recognition-design.md b/docs/plans/2026-02-01-facial-recognition-design.md
new file mode 100644
index 0000000..266376c
--- /dev/null
+++ b/docs/plans/2026-02-01-facial-recognition-design.md
@@ -0,0 +1,80 @@
+# Facial Recognition: OAK-D + Coral Edge TPU
+
+Add face detection and recognition to the oak-service spatial pipeline.
+
+## Architecture
+
+```
+OAK-D Lite (Myriad X)        Coral Edge TPU           Host (Pi 5)
+─────────────────────        ──────────────           ───────────
+yolov6-nano spatial          ssd_mobilenet_v2_face    crop person bbox
+  → person bboxes              → face bboxes          facenet (CPU)
+  → spatial coords (X,Y,Z)                              → 512-dim embedding
+  → RGB frames                                        cosine similarity vs SQLite DB
+                                                        → name + confidence
+```
+
+Per detection cycle (~0.5s):
+1. OAK-D outputs person detections + spatial coords + RGB frame (unchanged)
+2. Host crops upper-body region from RGB for each person bbox
+3. Coral runs face detection on crop (ssd_mobilenet_v2_face edgetpu)
+4. If face found, crop face, resize to model input, run FaceNet embedding on the host CPU
+5. Host compares embedding against SQLite DB (cosine similarity)
+6. Attach recognized_name + recognition_confidence to detection
+
+## Setup: Coral Runtime
+
+Install the LiteRT interpreter in the oak-service venv (face_recognition.py imports
+ai_edge_litert; the Edge TPU delegate additionally needs libedgetpu.so.1 installed
+on the host):
+```bash
+pip install ai-edge-litert
+```
+
+Download models:
+- ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite (face detection, Edge TPU)
+- facenet.tflite (512-dim face embeddings, float32, runs on CPU)
+
+Models stored in oak-service/models/ directory.
+
+## SQLite Face Database
+
+Path: configurable, default `faces.db` in service directory.
+
+```sql
+CREATE TABLE faces (
+    id INTEGER PRIMARY KEY,
+    name TEXT NOT NULL,
+    embedding BLOB NOT NULL,
+    enrolled_at REAL NOT NULL,
+    source TEXT
+);
+CREATE INDEX idx_faces_name ON faces(name);
+```
+
+- Multiple embeddings per person (different angles/lighting)
+- Embedding stored as packed float32 bytes
+- Matching: cosine similarity, threshold ~0.5 for positive match
+- Best match across all embeddings for a name wins
+
+## API Changes
+
+New endpoints:
+- `POST /faces/enroll` — multipart form: name + photo
+- `POST /faces/enroll-from-camera` — name as query param, uses the current camera frame
+- `GET /faces` — list enrolled names with embedding count
+- `DELETE /faces/{name}` — remove person from DB
+
+Modified responses:
+- `/presence` adds: recognized_name, recognition_confidence
+- `/detections` adds per-detection: recognized_name, recognition_confidence
+
+## Files
+
+- `face_recognition.py` — FaceRecognizer: Coral face detection, FaceNet embedding, SQLite matching
+- `oak_service_spatial.py` — add Coral face pipeline to detection loop
+- `models/` — Edge TPU model files
+- `faces.db` — SQLite database (created on first run)
+
+## Verification
+
+1. Install Coral runtime, verify device detected
+2. Download face models, verify inference runs
+3. Enroll a face via API
+4. Test recognition: stand in front of camera, check /presence for name
+5. Test unknown: a different person should return recognized_name = null
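As a quick sanity check of the API surface above, a client sketch. The `requests` dependency, the photo path, and the name are assumptions; `localhost:8100` matches the port `oak_service_spatial.py` binds, and the endpoints and fields match the diffs below:

```python
# Sketch: enroll from a saved photo, then read back recognition state.
# Assumes the service is running locally and `requests` is installed.
import requests

BASE = "http://localhost:8100"

with open("alice.jpg", "rb") as f:  # photo path is illustrative
    r = requests.post(f"{BASE}/faces/enroll",
                      data={"name": "alice"}, files={"photo": f})
print(r.json())  # e.g. {'success': True, 'name': 'alice', 'embedding_count': 1}

print(requests.get(f"{BASE}/faces").json())
print(requests.get(f"{BASE}/presence").json().get("recognized_name"))
```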
diff --git a/face_recognition.py b/face_recognition.py
new file mode 100644
index 0000000..bd31a33
--- /dev/null
+++ b/face_recognition.py
@@ -0,0 +1,270 @@
+"""
+Face Recognition Module for OAK-D Vision Service
+Coral Edge TPU for face detection + CPU FaceNet for embeddings + SQLite DB
+"""
+
+import sqlite3
+import threading
+import time
+import logging
+from pathlib import Path
+
+import ai_edge_litert.interpreter as tfl
+import cv2
+import numpy as np
+
+logger = logging.getLogger("face_recognition")
+
+FACE_DETECT_THRESHOLD = 0.5
+RECOGNITION_THRESHOLD = 0.5
+EMBEDDING_DIM = 512
+
+
+class FaceRecognizer:
+    def __init__(self, face_model_path, embed_model_path, db_path="faces.db"):
+        self._lock = threading.Lock()
+
+        # Coral face detector
+        logger.info("Loading face detection model on Edge TPU...")
+        delegate = tfl.load_delegate("libedgetpu.so.1")
+        self._face_interp = tfl.Interpreter(
+            model_path=str(face_model_path),
+            experimental_delegates=[delegate],
+        )
+        self._face_interp.allocate_tensors()
+        self._face_input = self._face_interp.get_input_details()[0]
+        self._face_outputs = self._face_interp.get_output_details()
+        logger.info(
+            "Face detector ready: input %s %s",
+            self._face_input["shape"],
+            self._face_input["dtype"],
+        )
+
+        # CPU FaceNet embedder
+        logger.info("Loading FaceNet embedding model on CPU...")
+        self._embed_interp = tfl.Interpreter(model_path=str(embed_model_path))
+        self._embed_interp.allocate_tensors()
+        self._embed_input = self._embed_interp.get_input_details()[0]
+        self._embed_output = self._embed_interp.get_output_details()[0]
+        logger.info(
+            "FaceNet ready: input %s, output %s",
+            self._embed_input["shape"],
+            self._embed_output["shape"],
+        )
+
+        # SQLite DB
+        self._db_path = str(db_path)
+        self._db = sqlite3.connect(self._db_path, check_same_thread=False)
+        self._db.execute(
+            """CREATE TABLE IF NOT EXISTS faces (
+                id INTEGER PRIMARY KEY,
+                name TEXT NOT NULL,
+                embedding BLOB NOT NULL,
+                enrolled_at REAL NOT NULL,
+                source TEXT
+            )"""
+        )
+        self._db.execute(
+            "CREATE INDEX IF NOT EXISTS idx_faces_name ON faces(name)"
+        )
+        self._db.commit()
+
+        # Load embedding cache
+        self._cache = []  # list of (name, embedding_array)
+        self._reload_cache()
+        logger.info("Face DB: %d embeddings loaded", len(self._cache))
+
+    def _reload_cache(self):
+        rows = self._db.execute("SELECT name, embedding FROM faces").fetchall()
+        cache = []
+        for name, blob in rows:
+            emb = np.frombuffer(blob, dtype=np.float32).copy()
+            if len(emb) == EMBEDDING_DIM:
+                cache.append((name, emb))
+        self._cache = cache
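+
+    # Storage note: embeddings round-trip through SQLite as raw float32
+    # bytes (ndarray.tobytes() on enroll, np.frombuffer() above), so byte
+    # order follows the host CPU; the DB is only portable between
+    # same-endian machines, which holds for a single-Pi deployment.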
+
+    def _detect_face(self, image):
+        """Run face detection on Coral.
+
+        Returns (bbox, score) where bbox is (y1, x1, y2, x2) in pixels,
+        or (None, 0.0) if no face clears FACE_DETECT_THRESHOLD.
+        """
+        h, w = image.shape[:2]
+        inp_h, inp_w = self._face_input["shape"][1:3]
+        resized = cv2.resize(image, (inp_w, inp_h))
+        if resized.dtype != np.uint8:
+            resized = resized.astype(np.uint8)
+        self._face_interp.set_tensor(
+            self._face_input["index"], resized[np.newaxis]
+        )
+        self._face_interp.invoke()
+
+        # Parse SSD postprocess outputs:
+        # boxes [1,50,4], classes [1,50], scores [1,50], count [1]
+        boxes = self._face_interp.get_tensor(self._face_outputs[0]["index"])[0]
+        scores = self._face_interp.get_tensor(self._face_outputs[2]["index"])[0]
+        count = int(
+            self._face_interp.get_tensor(self._face_outputs[3]["index"])[0]
+        )
+
+        best_score = 0.0
+        best_box = None
+        for i in range(min(count, len(scores))):
+            if scores[i] >= FACE_DETECT_THRESHOLD and scores[i] > best_score:
+                best_score = scores[i]
+                # boxes are [ymin, xmin, ymax, xmax] normalized 0-1
+                ymin, xmin, ymax, xmax = boxes[i]
+                best_box = (
+                    max(0, int(ymin * h)),
+                    max(0, int(xmin * w)),
+                    min(h, int(ymax * h)),
+                    min(w, int(xmax * w)),
+                )
+
+        return best_box, best_score
+
+    def _compute_embedding(self, face_image):
+        """Compute a 512-dim embedding from a face crop. Returns numpy array."""
+        inp_h, inp_w = self._embed_input["shape"][1:3]
+        resized = cv2.resize(face_image, (inp_w, inp_h))
+        # FaceNet preprocessing: normalize to [-1, 1]
+        normalized = (resized.astype(np.float32) / 127.5) - 1.0
+        self._embed_interp.set_tensor(
+            self._embed_input["index"], normalized[np.newaxis]
+        )
+        self._embed_interp.invoke()
+        emb = self._embed_interp.get_tensor(self._embed_output["index"])[0].copy()
+        # L2-normalize so cosine similarity reduces to a plain dot product,
+        # even if the converted model omits FaceNet's normalization layer
+        norm = np.linalg.norm(emb)
+        return emb / norm if norm > 0 else emb
+
+    def _match_embedding(self, embedding):
+        """Match embedding against DB. Returns (name, confidence) or (None, 0.0)."""
+        cache = self._cache  # snapshot reference
+        if not cache:
+            return None, 0.0
+
+        # Cosine similarity (embeddings are L2-normalized in
+        # _compute_embedding, so dot product == cosine similarity)
+        best_scores = {}  # name -> best score
+        for name, stored_emb in cache:
+            score = float(np.dot(embedding, stored_emb))
+            if name not in best_scores or score > best_scores[name]:
+                best_scores[name] = score
+
+        best_name = max(best_scores, key=best_scores.get)
+        best_conf = best_scores[best_name]
+
+        if best_conf >= RECOGNITION_THRESHOLD:
+            return best_name, best_conf
+        return None, best_conf
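+
+    # Threshold note: 0.5 cosine similarity is a reasonable starting point
+    # for FaceNet-style embeddings (same face scores near 1.0, unrelated
+    # faces near 0.0), but the spread depends on the converted model;
+    # tune RECOGNITION_THRESHOLD against a few enrolled/unknown test images.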
+
+    def process_frame(self, rgb_frame, person_detections):
+        """Process an RGB frame with person detections, return face recognition results.
+
+        Args:
+            rgb_frame: BGR numpy array from OAK-D (H, W, 3)
+            person_detections: list of depthai detection objects with
+                xmin/ymin/xmax/ymax (normalized 0-1)
+
+        Returns:
+            list of dicts (same order as person_detections):
+                {recognized_name: str|None, recognition_confidence: float|None}
+        """
+        h, w = rgb_frame.shape[:2]
+        results = []
+
+        for det in person_detections:
+            # Crop upper 40% of person bbox (head + shoulders)
+            px1 = max(0, int(det.xmin * w))
+            py1 = max(0, int(det.ymin * h))
+            px2 = min(w, int(det.xmax * w))
+            py2 = min(h, int(det.ymax * h))
+
+            bbox_h = py2 - py1
+            upper_y2 = py1 + int(bbox_h * 0.4)
+
+            # Add 10% horizontal padding
+            pad_x = int((px2 - px1) * 0.1)
+            crop_x1 = max(0, px1 - pad_x)
+            crop_x2 = min(w, px2 + pad_x)
+
+            crop = rgb_frame[py1:upper_y2, crop_x1:crop_x2]
+            if crop.size == 0:
+                results.append({"recognized_name": None, "recognition_confidence": None})
+                continue
+
+            # Face detection on Coral
+            face_box, face_score = self._detect_face(crop)
+            if face_box is None:
+                results.append({"recognized_name": None, "recognition_confidence": None})
+                continue
+
+            # Crop face and compute embedding
+            fy1, fx1, fy2, fx2 = face_box
+            face_crop = crop[fy1:fy2, fx1:fx2]
+            if face_crop.size == 0:
+                results.append({"recognized_name": None, "recognition_confidence": None})
+                continue
+
+            embedding = self._compute_embedding(face_crop)
+            name, confidence = self._match_embedding(embedding)
+
+            results.append({
+                "recognized_name": name,
+                "recognition_confidence": round(confidence, 3),
+            })
+
+        return results
+
+    def enroll(self, name, image):
+        """Detect face in image, compute embedding, store in DB.
+
+        Args:
+            name: person's name
+            image: BGR numpy array containing a face
+
+        Returns:
+            dict with success status and embedding count
+        """
+        face_box, face_score = self._detect_face(image)
+        if face_box is None:
+            return {"success": False, "error": "No face detected in image"}
+
+        fy1, fx1, fy2, fx2 = face_box
+        face_crop = image[fy1:fy2, fx1:fx2]
+        if face_crop.size == 0:
+            return {"success": False, "error": "Face crop is empty"}
+
+        embedding = self._compute_embedding(face_crop)
+
+        with self._lock:
+            self._db.execute(
+                "INSERT INTO faces (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)",
+                (name, embedding.tobytes(), time.time(), "api"),
+            )
+            self._db.commit()
+            self._reload_cache()
+
+        count = sum(1 for n, _ in self._cache if n == name)
+        logger.info("Enrolled face for '%s' (score=%.2f), %d total embeddings", name, face_score, count)
+        return {"success": True, "name": name, "embedding_count": count}
+
+    def list_faces(self):
+        """Return list of enrolled names with embedding counts."""
+        rows = self._db.execute(
+            "SELECT name, COUNT(*) as cnt, MIN(enrolled_at) as first "
+            "FROM faces GROUP BY name ORDER BY name"
+        ).fetchall()
+        return [
+            {"name": r[0], "embedding_count": r[1], "enrolled_at": r[2]}
+            for r in rows
+        ]
+
+    def delete_face(self, name):
+        """Remove all embeddings for a name."""
+        with self._lock:
+            cur = self._db.execute("DELETE FROM faces WHERE name = ?", (name,))
+            self._db.commit()
+            self._reload_cache()
+        deleted = cur.rowcount
+        logger.info("Deleted %d embeddings for '%s'", deleted, name)
+        return {"success": deleted > 0, "name": name, "deleted": deleted}
+
+    def close(self):
+        """Close DB connection."""
+        self._db.close()
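Before wiring this into the service, a standalone smoke test can confirm the models and DB behave. A sketch with assumptions: the image path and throwaway DB name are illustrative, while the `FaceRecognizer` API matches the module above:

```python
# Hedged smoke test for face_recognition.py; run on the Pi with models in place.
import cv2
from face_recognition import FaceRecognizer

rec = FaceRecognizer(
    face_model_path="models/ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite",
    embed_model_path="models/facenet.tflite",
    db_path="faces_test.db",           # throwaway DB so faces.db stays clean
)
img = cv2.imread("test_face.jpg")      # BGR, as the module expects
print(rec.enroll("testperson", img))   # expect {'success': True, 'embedding_count': 1, ...}
print(rec.list_faces())                # [{'name': 'testperson', 'embedding_count': 1, ...}]
rec.delete_face("testperson")
rec.close()
```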
diff --git a/oak_service_spatial.py b/oak_service_spatial.py
index 9537349..2150f76 100644
--- a/oak_service_spatial.py
+++ b/oak_service_spatial.py
@@ -11,13 +11,20 @@ Day 82 - SPATIAL UPGRADE!
 Now I know how far away you are! 📏🦊
 import time
 import threading
+import logging
+from pathlib import Path
 from contextlib import asynccontextmanager
 
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import Response
 import depthai as dai
 import cv2
 import numpy as np
+from face_recognition import FaceRecognizer
+
+logger = logging.getLogger("oak-service")
+logging.basicConfig(level=logging.INFO)
+
 
 # ============== Configuration ==============
 DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
 PERSON_CLASS_ID = 0  # 'person' is class 0 in COCO
@@ -29,6 +36,12 @@ DETECTION_INTERVAL = 0.5
 DEPTH_LOWER_THRESHOLD = 100  # 10cm minimum
 DEPTH_UPPER_THRESHOLD = 10000  # 10m maximum
 
+# Face recognition models
+MODELS_DIR = Path(__file__).parent / "models"
+FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
+FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
+FACE_DB_PATH = Path(__file__).parent / "faces.db"
+
 # ============== Global State ==============
 pipeline_ctx = None
 detection_queue = None
@@ -37,6 +50,7 @@ depth_queue = None
 detection_thread = None
 running = False
 labels = []
+face_recognizer = None
 
 presence_state = {
     "present": False,
@@ -45,14 +59,35 @@ presence_state = {
     "last_detection": None,
     "detections": [],
     "confidence": 0.0,
-    # NEW: spatial data!
+    # Spatial data
    "distance_mm": None,
     "spatial_x": None,
     "spatial_y": None,
     "spatial_z": None,
+    # Face recognition
+    "recognized_name": None,
+    "recognition_confidence": None,
 }
 
+
+def init_face_recognition():
+    """Initialize Coral face detection + FaceNet embedding."""
+    global face_recognizer
+    try:
+        face_recognizer = FaceRecognizer(
+            face_model_path=FACE_DETECT_MODEL,
+            embed_model_path=FACE_EMBED_MODEL,
+            db_path=FACE_DB_PATH,
+        )
+        print("✅ Face recognition initialized (Coral + FaceNet)")
+        return True
+    except Exception as e:
+        print(f"⚠️ Face recognition unavailable: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
 def init_oak():
     """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3)."""
     global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels
@@ -123,9 +158,13 @@ def init_oak():
 
 def cleanup_oak():
     """Cleanup OAK-D resources."""
-    global pipeline_ctx, running
+    global pipeline_ctx, running, face_recognizer
     running = False
-
+
+    if face_recognizer:
+        face_recognizer.close()
+        face_recognizer = None
+
     if pipeline_ctx:
         try:
             pipeline_ctx.stop()
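For consumers of the API, the new fields surface in `/presence` as shown in the endpoint hunk further down. An abridged, illustrative payload (all values invented):

```python
# Abridged /presence payload with recognition active; values are made up.
# recognized_name serializes to JSON null whenever nothing clears
# RECOGNITION_THRESHOLD; the API never returns the string "unknown".
{
    "present": True,
    "confidence": 0.91,
    "distance_mm": 1840.0,
    "distance_m": 1.84,
    "spatial": {"x_mm": -120.0, "y_mm": 35.0, "z_mm": 1840.0},
    "recognized_name": "alice",
    "recognition_confidence": 0.72,
    "timestamp": 1770000000.0,
}
```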
@@ -162,30 +201,60 @@ def detection_loop():
             if person_count > 0:
                 presence_state["present"] = True
                 presence_state["last_seen"] = now
-
+
                 # Get highest confidence detection
                 best = max(persons, key=lambda d: d.confidence)
                 presence_state["confidence"] = best.confidence
-
-                # SPATIAL DATA! 🎉
+
+                # Spatial data
                 presence_state["spatial_x"] = best.spatialCoordinates.x
                 presence_state["spatial_y"] = best.spatialCoordinates.y
                 presence_state["spatial_z"] = best.spatialCoordinates.z
-                presence_state["distance_mm"] = best.spatialCoordinates.z  # Z is depth
-
-                presence_state["detections"] = [
-                    {
+                presence_state["distance_mm"] = best.spatialCoordinates.z
+
+                # Face recognition
+                face_results = []
+                if face_recognizer and rgb_queue:
+                    rgb_data = rgb_queue.tryGet()
+                    if rgb_data is not None:
+                        rgb_frame = rgb_data.getCvFrame()
+                        try:
+                            face_results = face_recognizer.process_frame(
+                                rgb_frame, persons
+                            )
+                        except Exception as e:
+                            logger.warning("Face recognition error: %s", e)
+
+                det_list = []
+                best_recognized = None
+                best_recog_conf = 0.0
+                for i, d in enumerate(persons):
+                    det = {
                         "xmin": d.xmin,
                         "ymin": d.ymin,
                         "xmax": d.xmax,
                         "ymax": d.ymax,
                         "confidence": d.confidence,
-                        # Spatial coordinates in mm
                         "x_mm": d.spatialCoordinates.x,
                         "y_mm": d.spatialCoordinates.y,
                         "z_mm": d.spatialCoordinates.z,
                         "distance_m": d.spatialCoordinates.z / 1000.0,
+                        "recognized_name": None,
+                        "recognition_confidence": None,
                     }
-                    for d in persons
-                ]
+                    if i < len(face_results):
+                        det["recognized_name"] = face_results[i]["recognized_name"]
+                        det["recognition_confidence"] = face_results[i]["recognition_confidence"]
+                        if det["recognized_name"] and (
+                            det["recognition_confidence"] or 0
+                        ) > best_recog_conf:
+                            best_recognized = det["recognized_name"]
+                            best_recog_conf = det["recognition_confidence"]
+                    det_list.append(det)
+
+                presence_state["detections"] = det_list
+                presence_state["recognized_name"] = best_recognized
+                presence_state["recognition_confidence"] = (
+                    round(best_recog_conf, 3) if best_recognized else None
+                )
             else:
                 presence_state["detections"] = []
                 presence_state["confidence"] = 0.0
@@ -193,7 +262,9 @@
                 presence_state["spatial_y"] = None
                 presence_state["spatial_z"] = None
                 presence_state["distance_mm"] = None
-
+                presence_state["recognized_name"] = None
+                presence_state["recognition_confidence"] = None
+
             # Check timeout
             if presence_state["last_seen"]:
                 if now - presence_state["last_seen"] > PRESENCE_TIMEOUT:
@@ -214,7 +285,9 @@ async def lifespan(app: FastAPI):
     global running, detection_thread
 
     print("🦊 Starting OAK-D SPATIAL Vision Service...")
-
+
+    init_face_recognition()
+
    if init_oak():
         running = True
         detection_thread = threading.Thread(target=detection_loop, daemon=True)
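Startup is deliberately fail-soft: if the Coral stack is missing, `init_face_recognition()` logs the error and the service keeps running with recognition disabled (`/health` then reports `face_recognition_enabled: false`). Once it is up, camera-based enrollment is a single call; a sketch with the host and name as placeholders, again assuming `requests`:

```python
# Sketch: enroll whoever is in frame right now, then prune an enrollment.
import requests

BASE = "http://localhost:8100"

r = requests.post(f"{BASE}/faces/enroll-from-camera", params={"name": "alice"})
print(r.status_code, r.json())  # 503 if no frame/recognizer, 400 if no face found

requests.delete(f"{BASE}/faces/alice")  # removes all of alice's embeddings
```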
@@ -231,8 +304,8 @@
 app = FastAPI(
     title="OAK-D SPATIAL Vision Service",
-    description="Vixy's eyes with SPATIAL presence detection! 🦊👀📏",
-    version="0.4.0",
+    description="Vixy's eyes with SPATIAL presence detection + face recognition! 🦊👀📏",
+    version="0.5.0",
     lifespan=lifespan
 )
 
@@ -243,10 +316,11 @@ async def health():
     return {
         "status": "healthy",
         "service": "oak-service",
-        "version": "0.4.0",
+        "version": "0.5.0",
         "oak_connected": pipeline_ctx is not None,
         "detection_model": DETECTION_MODEL,
         "spatial_enabled": True,
+        "face_recognition_enabled": face_recognizer is not None,
         "timestamp": time.time()
     }
 
@@ -267,7 +341,6 @@ async def presence():
             if presence_state["last_seen"]
             else None
         ),
         "confidence": presence_state["confidence"],
-        # SPATIAL DATA
         "distance_mm": presence_state["distance_mm"],
         "distance_m": distance_m,
         "spatial": {
@@ -275,6 +348,8 @@
             "y_mm": presence_state["spatial_y"],
             "z_mm": presence_state["spatial_z"],
         } if presence_state["spatial_z"] else None,
+        "recognized_name": presence_state["recognized_name"],
+        "recognition_confidence": presence_state["recognition_confidence"],
         "timestamp": time.time()
     }
 
@@ -340,6 +415,68 @@ async def depth_frame():
         raise HTTPException(status_code=500, detail=str(e))
 
 
+# ============== Face Enrollment API ==============
+
+
+@app.post("/faces/enroll")
+async def enroll_face_upload(
+    name: str = Form(...),
+    photo: UploadFile = File(...),
+):
+    """Enroll a face by uploading a photo (multipart form: name + photo)."""
+    if face_recognizer is None:
+        raise HTTPException(status_code=503, detail="Face recognition not available")
+
+    contents = await photo.read()
+    nparr = np.frombuffer(contents, np.uint8)
+    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
+    if image is None:
+        raise HTTPException(status_code=400, detail="Could not decode image")
+
+    result = face_recognizer.enroll(name, image)
+    if not result["success"]:
+        raise HTTPException(status_code=400, detail=result["error"])
+    return result
+
+
+@app.post("/faces/enroll-from-camera")
+async def enroll_face_camera(name: str):
+    """Enroll a face using the current camera frame. Pass name as query param."""
+    if face_recognizer is None:
+        raise HTTPException(status_code=503, detail="Face recognition not available")
+    if rgb_queue is None:
+        raise HTTPException(status_code=503, detail="Camera not available")
+
+    frame_data = rgb_queue.tryGet()
+    if frame_data is None:
+        raise HTTPException(status_code=503, detail="No frame available")
+
+    image = frame_data.getCvFrame()
+    result = face_recognizer.enroll(name, image)
+    if not result["success"]:
+        raise HTTPException(status_code=400, detail=result["error"])
+    return result
+
+
+@app.get("/faces")
+async def list_faces():
+    """List enrolled faces."""
+    if face_recognizer is None:
+        raise HTTPException(status_code=503, detail="Face recognition not available")
+    return {"faces": face_recognizer.list_faces()}
+
+
+@app.delete("/faces/{name}")
+async def delete_face(name: str):
+    """Remove all embeddings for a person."""
+    if face_recognizer is None:
+        raise HTTPException(status_code=503, detail="Face recognition not available")
+    result = face_recognizer.delete_face(name)
+    if not result["success"]:
+        raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
+    return result
+
+
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8100)
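Closing the loop on verification step 1 (runtime installed, Edge TPU detected): a minimal probe. It uses the same delegate call as `face_recognition.py`, so a failure here reproduces exactly what the service would hit at startup:

```python
# Probe the Coral Edge TPU: loading the delegate fails fast if libedgetpu
# is missing or no device is attached. Same call face_recognition.py makes.
import ai_edge_litert.interpreter as tfl

try:
    tfl.load_delegate("libedgetpu.so.1")
    print("Edge TPU delegate loaded OK")
except (OSError, ValueError) as e:
    print(f"Edge TPU unavailable: {e}")
```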