facial recognition

2026-02-01 11:36:22 -06:00
parent 3c273d7d02
commit 3ac8778cac
3 changed files with 507 additions and 20 deletions
--- a/docs/plans/2026-02-01-facial-recognition-design.md
+++ b/docs/plans/2026-02-01-facial-recognition-design.md
@@ -0,0 +1,80 @@
 # Facial Recognition: OAK-D + Coral Edge TPU
 Add face detection and recognition to the oak-service spatial pipeline.
 ## Architecture
 ```
 OAK-D Lite (Myriad X)              Coral Edge TPU              Host (Pi 5)
 ──────────────────────              ──────────────              ───────────
 yolov6-nano spatial                 ssd_mobilenet_v2_face       crop person bbox
  → person bboxes                     → face bboxes             cosine similarity
  → spatial coords (X,Y,Z)         arcface/facenet edgetpu      vs SQLite DB
  → RGB frames                       → 512-dim embedding        → name + confidence
 ```
 Per detection cycle (~0.5s):
 1. OAK-D outputs person detections + spatial coords + RGB frame (unchanged)
 2. Host crops upper-body region from RGB for each person bbox
 3. Coral runs face detection on crop (ssd_mobilenet_v2_face edgetpu)
 4. If face found, crop face, resize to model input, run embedding via Coral
 5. Host compares embedding against SQLite DB (cosine similarity)
 6. Attach recognized_name + recognition_confidence to detection
 ## Setup: Coral Runtime
 Install pycoral + tflite-runtime in the oak-service venv:
 ```bash
 pip install tflite-runtime pycoral
 ```
 Download Edge TPU models:
 - ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite
 - face embedding model (facenet or arcface quantized for edgetpu)
 Models stored in oak-service/models/ directory.
 ## SQLite Face Database
 Path: configurable, default `faces.db` in service directory.
 ```sql
 CREATE TABLE faces (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL,
    embedding BLOB NOT NULL,
    enrolled_at REAL NOT NULL,
    source TEXT
 );
 CREATE INDEX idx_faces_name ON faces(name);
 ```
 - Multiple embeddings per person (different angles/lighting)
 - Embedding stored as packed float32 bytes
 - Matching: cosine similarity, threshold ~0.5 for positive match
 - Best match across all embeddings for a name wins
 ## API Changes
 New endpoints:
 - `POST /faces/enroll` — multipart: name + photo
 - `POST /faces/enroll-from-camera?name=...` — name as query parameter, uses the current camera frame
 - `GET /faces` — list enrolled names with embedding count
 - `DELETE /faces/{name}` — remove person from DB
 Modified responses:
 - `/presence` adds: recognized_name, recognition_confidence
 - `/detections` adds per-detection: recognized_name, recognition_confidence
 ## Files
 - `oak_service_spatial.py` — add Coral face pipeline to detection loop
 - `models/` — Edge TPU model files
 - `faces.db` — SQLite database (created on first run)
 ## Verification
 1. Install Coral runtime, verify device detected
 2. Download face models, verify inference runs
 3. Enroll a face via API
 4. Test recognition: stand in front of camera, check /presence for name
 5. Test unknown: different person, /presence should report no name (recognized_name is null)
--- a/face_recognition.py
+++ b/face_recognition.py
@@ -0,0 +1,270 @@
 """
 Face Recognition Module for OAK-D Vision Service
 Coral Edge TPU for face detection + CPU FaceNet for embeddings + SQLite DB
 """
 import sqlite3
 import threading
 import time
 import logging
 from pathlib import Path
 import ai_edge_litert.interpreter as tfl
 import cv2
 import numpy as np
 logger = logging.getLogger("face_recognition")

 # Minimum detector score for a face box to be accepted.
 FACE_DETECT_THRESHOLD = 0.5
 # Minimum cosine similarity for a stored identity to count as a match.
 RECOGNITION_THRESHOLD = 0.5
 # Length every stored and queried embedding must have.
 EMBEDDING_DIM = 512


 class FaceRecognizer:
     """Detect faces, embed them, and match them against enrolled identities.

     Face detection runs through the Edge TPU delegate, embeddings are
     computed by a second interpreter created without a delegate, and
     enrolled embeddings live in a SQLite table mirrored by an in-memory
     cache of unit-length vectors.

     Inference and database access are each guarded by their own lock so a
     single instance can be shared between a thread calling
     ``process_frame`` and other threads calling ``enroll`` /
     ``delete_face`` / ``list_faces``.
     """

     # Fraction of a person box (measured from its top) searched for a face.
     _UPPER_BODY_FRACTION = 0.4
     # Horizontal padding added on each side of the person box.
     _HORIZONTAL_PAD_FRACTION = 0.1

     def __init__(self, face_model_path, embed_model_path, db_path="faces.db"):
         """Load both models, open the face database and fill the cache.

         Args:
             face_model_path: path of the face-detection .tflite model that
                 is loaded with the Edge TPU delegate.
             embed_model_path: path of the embedding .tflite model.
             db_path: SQLite file holding the enrolled embeddings; the
                 ``faces`` table is created if it does not exist.
         """
         # Serializes every use of the shared SQLite connection.
         self._lock = threading.Lock()
         # Serializes set_tensor/invoke/get_tensor sequences: an interpreter
         # must not be driven by two threads at the same time.
         self._infer_lock = threading.Lock()

         # Coral face detector
         logger.info("Loading face detection model on Edge TPU...")
         delegate = tfl.load_delegate("libedgetpu.so.1")
         self._face_interp = tfl.Interpreter(
             model_path=str(face_model_path),
             experimental_delegates=[delegate],
         )
         self._face_interp.allocate_tensors()
         self._face_input = self._face_interp.get_input_details()[0]
         self._face_outputs = self._face_interp.get_output_details()
         logger.info(
             "Face detector ready: input %s %s",
             self._face_input["shape"],
             self._face_input["dtype"],
         )

         # CPU FaceNet embedder
         logger.info("Loading FaceNet embedding model on CPU...")
         self._embed_interp = tfl.Interpreter(model_path=str(embed_model_path))
         self._embed_interp.allocate_tensors()
         self._embed_input = self._embed_interp.get_input_details()[0]
         self._embed_output = self._embed_interp.get_output_details()[0]
         logger.info(
             "FaceNet ready: input %s, output %s",
             self._embed_input["shape"],
             self._embed_output["shape"],
         )

         # SQLite DB; check_same_thread=False because the connection is used
         # from more than one thread, always under self._lock.
         self._db_path = str(db_path)
         self._db = sqlite3.connect(self._db_path, check_same_thread=False)
         self._db.execute(
             """CREATE TABLE IF NOT EXISTS faces (
                 id INTEGER PRIMARY KEY,
                 name TEXT NOT NULL,
                 embedding BLOB NOT NULL,
                 enrolled_at REAL NOT NULL,
                 source TEXT
             )"""
         )
         self._db.execute(
             "CREATE INDEX IF NOT EXISTS idx_faces_name ON faces(name)"
         )
         self._db.commit()

         # Embedding cache: list of (name, unit-length float32 vector).
         self._cache = []
         self._reload_cache()
         logger.info("Face DB: %d embeddings loaded", len(self._cache))

     @staticmethod
     def _normalize(vector):
         """Return ``vector`` as a unit-length float32 array, or None.

         None is returned when the vector does not have EMBEDDING_DIM
         elements, has zero length, or contains non-finite values.
         """
         vec = np.asarray(vector, dtype=np.float32).reshape(-1)
         if vec.size != EMBEDDING_DIM:
             return None
         norm = float(np.linalg.norm(vec))
         if not np.isfinite(norm) or norm == 0.0:
             return None
         return vec / norm

     def _reload_cache(self):
         """Rebuild the in-memory cache from the ``faces`` table.

         Rows whose blob is not a usable EMBEDDING_DIM float32 vector are
         skipped and reported in a warning. The new list replaces
         ``self._cache`` in one assignment so readers holding the old list
         keep a consistent snapshot. Callers that can race with writers
         must hold ``self._lock``.
         """
         rows = self._db.execute("SELECT name, embedding FROM faces").fetchall()
         expected_bytes = EMBEDDING_DIM * np.dtype(np.float32).itemsize
         cache = []
         skipped = 0
         for name, blob in rows:
             # Check the byte length first: np.frombuffer raises when the
             # buffer size is not a multiple of the element size.
             unit = None
             if len(blob) == expected_bytes:
                 unit = self._normalize(np.frombuffer(blob, dtype=np.float32))
             if unit is None:
                 skipped += 1
                 continue
             cache.append((name, unit))
         if skipped:
             logger.warning("Face DB: skipped %d unusable embeddings", skipped)
         self._cache = cache

     def _detect_face(self, image):
         """Run face detection on ``image`` and pick the best-scoring face.

         Returns:
             (box, score). ``box`` is (y1, x1, y2, x2) in pixels of
             ``image``, clamped to its bounds, or None when no detection
             reaches FACE_DETECT_THRESHOLD; ``score`` is that detection's
             score, or 0.0 when ``box`` is None.
         """
         h, w = image.shape[:2]
         inp_h, inp_w = self._face_input["shape"][1:3]
         resized = cv2.resize(image, (inp_w, inp_h))
         if resized.dtype != np.uint8:
             resized = resized.astype(np.uint8)

         with self._infer_lock:
             self._face_interp.set_tensor(
                 self._face_input["index"], resized[np.newaxis]
             )
             self._face_interp.invoke()
             # Output order assumed: boxes [1,N,4], classes [1,N],
             # scores [1,N], count [1] — TODO confirm for the deployed model.
             boxes = self._face_interp.get_tensor(self._face_outputs[0]["index"])[0]
             scores = self._face_interp.get_tensor(self._face_outputs[2]["index"])[0]
             count = int(
                 self._face_interp.get_tensor(self._face_outputs[3]["index"])[0]
             )

         usable = min(count, len(scores))
         if usable <= 0:
             return None, 0.0
         best = int(np.argmax(scores[:usable]))
         best_score = float(scores[best])
         if best_score < FACE_DETECT_THRESHOLD:
             return None, 0.0

         # Boxes are [ymin, xmin, ymax, xmax] normalized to 0-1.
         ymin, xmin, ymax, xmax = boxes[best]
         best_box = (
             max(0, int(ymin * h)),
             max(0, int(xmin * w)),
             min(h, int(ymax * h)),
             min(w, int(xmax * w)),
         )
         return best_box, best_score

     def _compute_embedding(self, face_image):
         """Compute the embedding of a face crop.

         The crop is resized to the model input size and scaled to [-1, 1].

         Returns:
             A 1-D float32 numpy array holding the raw model output (not
             normalized; normalization happens when matching).
         """
         inp_h, inp_w = self._embed_input["shape"][1:3]
         resized = cv2.resize(face_image, (inp_w, inp_h))
         # FaceNet preprocessing: normalize to [-1, 1]
         normalized = (resized.astype(np.float32) / 127.5) - 1.0
         with self._infer_lock:
             self._embed_interp.set_tensor(
                 self._embed_input["index"], normalized[np.newaxis]
             )
             self._embed_interp.invoke()
             raw = self._embed_interp.get_tensor(self._embed_output["index"])[0]
         return np.array(raw, dtype=np.float32).reshape(-1)

     def _match_embedding(self, embedding):
         """Match ``embedding`` against the cached identities.

         Both the query and the cached vectors are unit length, so the dot
         product is the cosine similarity regardless of the scale of the
         model output. For each name the best score over all of its
         embeddings is used.

         Returns:
             (name, confidence) when the best similarity reaches
             RECOGNITION_THRESHOLD, (None, confidence) when it does not,
             and (None, 0.0) when the cache is empty or the query vector is
             unusable.
         """
         cache = self._cache  # snapshot reference
         query = self._normalize(embedding)
         if query is None or not cache:
             return None, 0.0

         best_scores = {}  # name -> best score
         for name, stored_emb in cache:
             score = float(np.dot(query, stored_emb))
             if name not in best_scores or score > best_scores[name]:
                 best_scores[name] = score

         best_name = max(best_scores, key=best_scores.get)
         best_conf = best_scores[best_name]
         if best_conf >= RECOGNITION_THRESHOLD:
             return best_name, best_conf
         return None, best_conf

     def _recognize_person(self, frame, det, w, h):
         """Return the recognition result dict for one person detection."""
         no_face = {"recognized_name": None, "recognition_confidence": None}

         # Person box in pixels, clamped to the frame.
         px1 = max(0, int(det.xmin * w))
         py1 = max(0, int(det.ymin * h))
         px2 = min(w, int(det.xmax * w))
         py2 = min(h, int(det.ymax * h))

         # Keep only the upper part of the box (head + shoulders) and widen
         # it slightly so a face at the edge of the box is not cut.
         upper_y2 = py1 + int((py2 - py1) * self._UPPER_BODY_FRACTION)
         pad_x = int((px2 - px1) * self._HORIZONTAL_PAD_FRACTION)
         crop = frame[py1:upper_y2, max(0, px1 - pad_x):min(w, px2 + pad_x)]
         if crop.size == 0:
             return no_face

         face_box, _ = self._detect_face(crop)
         if face_box is None:
             return no_face

         fy1, fx1, fy2, fx2 = face_box
         face_crop = crop[fy1:fy2, fx1:fx2]
         if face_crop.size == 0:
             return no_face

         name, confidence = self._match_embedding(
             self._compute_embedding(face_crop)
         )
         return {
             "recognized_name": name,
             "recognition_confidence": round(confidence, 3),
         }

     def process_frame(self, rgb_frame, person_detections):
         """Run face recognition for every person detection in a frame.

         Args:
             rgb_frame: BGR numpy array from OAK-D (H, W, 3).
                 NOTE(review): the frame is fed to both models without a
                 channel-order conversion — confirm the models expect it.
             person_detections: detection objects exposing
                 xmin/ymin/xmax/ymax normalized to 0-1.

         Returns:
             A list of dicts in the same order as ``person_detections``:
             {recognized_name: str|None, recognition_confidence: float|None}.
             Both values are None when no face was found; the name alone is
             None when a face was found but nothing matched.
         """
         h, w = rgb_frame.shape[:2]
         return [
             self._recognize_person(rgb_frame, det, w, h)
             for det in person_detections
         ]

     def enroll(self, name, image):
         """Detect a face in ``image`` and store its embedding under ``name``.

         Args:
             name: person's name; must be a non-blank string.
             image: BGR numpy array containing a face.

         Returns:
             {"success": True, "name": ..., "embedding_count": ...} on
             success, otherwise {"success": False, "error": ...}.
         """
         if not isinstance(name, str) or not name.strip():
             return {"success": False, "error": "Name must not be empty"}

         face_box, face_score = self._detect_face(image)
         if face_box is None:
             return {"success": False, "error": "No face detected in image"}

         fy1, fx1, fy2, fx2 = face_box
         face_crop = image[fy1:fy2, fx1:fx2]
         if face_crop.size == 0:
             return {"success": False, "error": "Face crop is empty"}

         embedding = self._compute_embedding(face_crop)
         # Refuse vectors the cache would drop on reload; otherwise the row
         # would be stored yet never match anything.
         if self._normalize(embedding) is None:
             return {
                 "success": False,
                 "error": "Embedding model produced an unusable vector",
             }

         with self._lock:
             # The connection context manager commits, or rolls back if the
             # insert raises.
             with self._db:
                 self._db.execute(
                     "INSERT INTO faces (name, embedding, enrolled_at, source) VALUES (?, ?, ?, ?)",
                     (name, embedding.tobytes(), time.time(), "api"),
                 )
             self._reload_cache()
             count = sum(1 for n, _ in self._cache if n == name)

         logger.info("Enrolled face for '%s' (score=%.2f), %d total embeddings", name, face_score, count)
         return {"success": True, "name": name, "embedding_count": count}

     def list_faces(self):
         """Return enrolled names with their embedding count and first enrollment time."""
         with self._lock:
             rows = self._db.execute(
                 "SELECT name, COUNT(*) as cnt, MIN(enrolled_at) as first "
                 "FROM faces GROUP BY name ORDER BY name"
             ).fetchall()
         return [
             {"name": r[0], "embedding_count": r[1], "enrolled_at": r[2]}
             for r in rows
         ]

     def delete_face(self, name):
         """Remove all embeddings stored for ``name``.

         Returns:
             {"success": bool, "name": name, "deleted": row_count}; success
             is False when no row matched.
         """
         with self._lock:
             with self._db:
                 cur = self._db.execute("DELETE FROM faces WHERE name = ?", (name,))
             deleted = cur.rowcount
             self._reload_cache()

         logger.info("Deleted %d embeddings for '%s'", deleted, name)
         return {"success": deleted > 0, "name": name, "deleted": deleted}

     def close(self):
         """Close the DB connection, waiting for any in-flight DB operation."""
         with self._lock:
             self._db.close()
--- a/oak_service_spatial.py
+++ b/oak_service_spatial.py
@@ -11,13 +11,20 @@ Day 82 - SPATIAL UPGRADE! Now I know how far away you are! 📏🦊
 import time
 import threading
 import logging
 from pathlib import Path
 from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import Response
 import depthai as dai
 import cv2
 import numpy as np
 from face_recognition import FaceRecognizer
 logger = logging.getLogger("oak-service")
 logging.basicConfig(level=logging.INFO)
 # ============== Configuration ==============
 DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
 PERSON_CLASS_ID = 0  # 'person' is class 0 in COCO
@@ -29,6 +36,12 @@ DETECTION_INTERVAL = 0.5
 DEPTH_LOWER_THRESHOLD = 100   # 10cm minimum
 DEPTH_UPPER_THRESHOLD = 10000  # 10m maximum
 # Face recognition models
 # Directory next to this file that holds the .tflite model files.
 MODELS_DIR = Path(__file__).parent / "models"
 # Face detector; passed to FaceRecognizer as face_model_path.
 FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
 # Embedding model; passed to FaceRecognizer as embed_model_path.
 FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
 # SQLite file of enrolled faces; passed to FaceRecognizer as db_path.
 FACE_DB_PATH = Path(__file__).parent / "faces.db"
 # ============== Global State ==============
 pipeline_ctx = None
 detection_queue = None
@@ -37,6 +50,7 @@ depth_queue = None
 detection_thread = None
 running = False
 labels = []
 face_recognizer = None
 presence_state = {
    "present": False,
@@ -45,14 +59,35 @@ presence_state = {
    "last_detection": None,
    "detections": [],
    "confidence": 0.0,
-    # NEW: spatial data!
+    # Spatial data
    "distance_mm": None,
    "spatial_x": None,
    "spatial_y": None,
    "spatial_z": None,
    # Face recognition
    "recognized_name": None,
    "recognition_confidence": None,
 }
 def init_face_recognition():
     """Create the module-level FaceRecognizer from the configured paths.

     Any failure while building it is printed together with its traceback
     and leaves the service running without face recognition.

     Returns:
         True when the recognizer was created, False otherwise.
     """
     global face_recognizer
     try:
         recognizer = FaceRecognizer(
             face_model_path=FACE_DETECT_MODEL,
             embed_model_path=FACE_EMBED_MODEL,
             db_path=FACE_DB_PATH,
         )
         face_recognizer = recognizer
         print("✅ Face recognition initialized (Coral + FaceNet)")
         return True
     except Exception as exc:
         # Start-up boundary: report the problem and carry on without faces.
         print(f"⚠️ Face recognition unavailable: {exc}")
         import traceback
         traceback.print_exc()
     return False
 def init_oak():
    """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3)."""
    global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels
@@ -123,9 +158,13 @@ def init_oak():
 def cleanup_oak():
    """Cleanup OAK-D resources."""
-    global pipeline_ctx, running
+    global pipeline_ctx, running, face_recognizer
    running = False
    if face_recognizer:
        face_recognizer.close()
        face_recognizer = None
    if pipeline_ctx:
        try:
            pipeline_ctx.stop()
@@ -167,25 +206,55 @@ def detection_loop():
                    best = max(persons, key=lambda d: d.confidence)
                    presence_state["confidence"] = best.confidence
-                    # SPATIAL DATA! 🎉
+                    # Spatial data
                    presence_state["spatial_x"] = best.spatialCoordinates.x
                    presence_state["spatial_y"] = best.spatialCoordinates.y
                    presence_state["spatial_z"] = best.spatialCoordinates.z
-                    presence_state["distance_mm"] = best.spatialCoordinates.z  # Z is depth
+                    presence_state["distance_mm"] = best.spatialCoordinates.z
-                    presence_state["detections"] = [
+                    # Face recognition
-                        {
+                    face_results = []
                    if face_recognizer and rgb_queue:
                        rgb_data = rgb_queue.tryGet()
                        if rgb_data is not None:
                            rgb_frame = rgb_data.getCvFrame()
                            try:
                                face_results = face_recognizer.process_frame(
                                    rgb_frame, persons
                                )
                            except Exception as e:
                                logger.warning("Face recognition error: %s", e)
                    det_list = []
                    best_recognized = None
                    best_recog_conf = 0.0
                    for i, d in enumerate(persons):
                        det = {
                            "xmin": d.xmin, "ymin": d.ymin,
                            "xmax": d.xmax, "ymax": d.ymax,
                            "confidence": d.confidence,
                            # Spatial coordinates in mm
                            "x_mm": d.spatialCoordinates.x,
                            "y_mm": d.spatialCoordinates.y,
                            "z_mm": d.spatialCoordinates.z,
                            "distance_m": d.spatialCoordinates.z / 1000.0,
                            "recognized_name": None,
                            "recognition_confidence": None,
                        }
-                        for d in persons
+                        if i < len(face_results):
-                    ]
+                            det["recognized_name"] = face_results[i]["recognized_name"]
                            det["recognition_confidence"] = face_results[i]["recognition_confidence"]
                            if det["recognized_name"] and (
                                det["recognition_confidence"] or 0
                            ) > best_recog_conf:
                                best_recognized = det["recognized_name"]
                                best_recog_conf = det["recognition_confidence"]
                        det_list.append(det)
                    presence_state["detections"] = det_list
                    presence_state["recognized_name"] = best_recognized
                    presence_state["recognition_confidence"] = (
                        round(best_recog_conf, 3) if best_recognized else None
                    )
                else:
                    presence_state["detections"] = []
                    presence_state["confidence"] = 0.0
@@ -193,6 +262,8 @@ def detection_loop():
                    presence_state["spatial_y"] = None
                    presence_state["spatial_z"] = None
                    presence_state["distance_mm"] = None
                    presence_state["recognized_name"] = None
                    presence_state["recognition_confidence"] = None
                    # Check timeout
                    if presence_state["last_seen"]:
@@ -215,6 +286,8 @@ async def lifespan(app: FastAPI):
    print("🦊 Starting OAK-D SPATIAL Vision Service...")
    init_face_recognition()
    if init_oak():
        running = True
        detection_thread = threading.Thread(target=detection_loop, daemon=True)
@@ -231,8 +304,8 @@ async def lifespan(app: FastAPI):
 app = FastAPI(
    title="OAK-D SPATIAL Vision Service",
-    description="Vixy's eyes with SPATIAL presence detection! 🦊👀📏",
+    description="Vixy's eyes with SPATIAL presence detection + face recognition! 🦊👀📏",
-    version="0.4.0",
+    version="0.5.0",
    lifespan=lifespan
 )
@@ -243,10 +316,11 @@ async def health():
    return {
        "status": "healthy",
        "service": "oak-service",
-        "version": "0.4.0",
+        "version": "0.5.0",
        "oak_connected": pipeline_ctx is not None,
        "detection_model": DETECTION_MODEL,
        "spatial_enabled": True,
        "face_recognition_enabled": face_recognizer is not None,
        "timestamp": time.time()
    }
@@ -267,7 +341,6 @@ async def presence():
            if presence_state["last_seen"] else None
        ),
        "confidence": presence_state["confidence"],
        # SPATIAL DATA
        "distance_mm": presence_state["distance_mm"],
        "distance_m": distance_m,
        "spatial": {
@@ -275,6 +348,8 @@ async def presence():
            "y_mm": presence_state["spatial_y"],
            "z_mm": presence_state["spatial_z"],
        } if presence_state["spatial_z"] else None,
        "recognized_name": presence_state["recognized_name"],
        "recognition_confidence": presence_state["recognition_confidence"],
        "timestamp": time.time()
    }
@@ -340,6 +415,68 @@ async def depth_frame():
        raise HTTPException(status_code=500, detail=str(e))
 # ============== Face Enrollment API ==============
@app.post("/faces/enroll")
async def enroll_face_upload(
    name: str = Form(...),
    photo: UploadFile = File(...),
):
    """Enroll a face by uploading a photo (multipart form: name + photo).

    Responds 503 when face recognition is not initialized, 400 when the
    upload is empty, cannot be decoded, or enrollment fails, and otherwise
    returns the enrollment result dict.
    """
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")

    contents = await photo.read()
    # Reject an empty upload here: OpenCV asserts on an empty buffer, so
    # cv2.imdecode would raise (a 500) instead of returning None.
    if not contents:
        raise HTTPException(status_code=400, detail="Uploaded photo is empty")

    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        raise HTTPException(status_code=400, detail="Could not decode image")

    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result
@app.post("/faces/enroll-from-camera")
async def enroll_face_camera(name: str):
    """Enroll ``name`` (query parameter) from the frame currently queued by the camera.

    Responds 503 when face recognition, the RGB queue, or a frame is
    missing, and 400 when enrollment itself fails.
    """
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    if rgb_queue is None:
        raise HTTPException(status_code=503, detail="Camera not available")

    # Non-blocking read: None means nothing is queued right now.
    packet = rgb_queue.tryGet()
    if packet is None:
        raise HTTPException(status_code=503, detail="No frame available")

    outcome = face_recognizer.enroll(name, packet.getCvFrame())
    if outcome["success"]:
        return outcome
    raise HTTPException(status_code=400, detail=outcome["error"])
@app.get("/faces")
async def list_faces():
    """Return every enrolled face under the ``faces`` key; 503 without a recognizer."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    enrolled = face_recognizer.list_faces()
    return {"faces": enrolled}
@app.delete("/faces/{name}")
async def delete_face(name: str):
    """Delete every embedding stored for ``name``.

    Responds 503 without a recognizer and 404 when nothing was stored
    under that name; otherwise returns the deletion result dict.
    """
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")

    outcome = face_recognizer.delete_face(name)
    if outcome["success"]:
        return outcome
    raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
 # Script entry point: serve the FastAPI app with uvicorn.
 if __name__ == "__main__":
    # Imported here so uvicorn is only needed when run as a script.
    import uvicorn
    # Listen on all interfaces, port 8100.
    uvicorn.run(app, host="0.0.0.0", port=8100)