#!/usr/bin/env python3
"""
OAK-D Vision Service for Vixy's Head
FastAPI service with SPATIAL person detection and presence tracking
Day 74 - Built by Vixy! 🦊
Day 81 - Added presence detection! Now I can SEE you! 👀💜
Day 82 - SPATIAL UPGRADE! Now I know how far away you are! 📏🦊
Using depthai v3 API with SpatialDetectionNetwork + yolov6-nano
"""
import time
import threading
import logging
from pathlib import Path
from contextlib import asynccontextmanager

from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
import depthai as dai
import cv2
import numpy as np

from face_recognition import FaceRecognizer
from pose_estimator import PoseEstimator

logger = logging.getLogger("oak-service")
logging.basicConfig(level=logging.INFO)

# ============== Configuration ==============
DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
PERSON_CLASS_ID = 0  # 'person' is class 0 in COCO
DETECTION_THRESHOLD = 0.5
PRESENCE_TIMEOUT = 30.0  # seconds without person = not present
DETECTION_INTERVAL = 0.5  # seconds between detection-loop polls

# Spatial detection config
DEPTH_LOWER_THRESHOLD = 100    # 10cm minimum
DEPTH_UPPER_THRESHOLD = 10000  # 10m maximum

# Face recognition models
MODELS_DIR = Path(__file__).parent / "models"
FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
FACE_DB_PATH = Path(__file__).parent / "faces.db"

# Pose estimation
POSE_MODEL_PATH = MODELS_DIR / "movenet_single_pose_lightning_ptq_edgetpu.tflite"
POSE_CORAL_DEVICE = 1  # Second Coral (device 0 is headmic/YAMNet)

# ============== Global State ==============
# NOTE(review): these globals are written by the detection thread and read by
# async endpoints without a lock; dict item assignment is atomic in CPython,
# but multi-key snapshots may be momentarily inconsistent across keys.
pipeline_ctx = None       # running dai.Pipeline, or None when not initialized
detection_queue = None    # spatial detection output queue
rgb_queue = None          # RGB frame output queue (snapshots / face / pose)
depth_queue = None        # raw depth output queue (visualization)
detection_thread = None
running = False
labels = []               # class labels reported by the detection network
face_recognizer = None    # FaceRecognizer (Coral + FaceNet), or None
pose_estimator = None     # PoseEstimator (MoveNet on Coral 2), or None

presence_state = {
    "present": False,
    "person_count": 0,
    "last_seen": None,        # epoch seconds of last person sighting
    "last_detection": None,   # epoch seconds of last detection packet
    "detections": [],
    "confidence": 0.0,
    # Spatial data (millimeters, camera-centered coordinates)
    "distance_mm": None,
    "spatial_x": None,
    "spatial_y": None,
    "spatial_z": None,
    # Face recognition
    "recognized_name": None,
    "recognition_confidence": None,
}

pose_state = {
    "active": False,
    "keypoints": [],
    "posture": {},
    "num_valid": 0,
    "mean_confidence": 0.0,
    "inference_ms": 0.0,
    "last_update": None,
}


def init_face_recognition():
    """Initialize Coral face detection + FaceNet embedding.

    Returns True on success; on failure logs the traceback and returns False,
    leaving ``face_recognizer`` as None (service degrades gracefully).
    """
    global face_recognizer
    try:
        face_recognizer = FaceRecognizer(
            face_model_path=FACE_DETECT_MODEL,
            embed_model_path=FACE_EMBED_MODEL,
            db_path=FACE_DB_PATH,
        )
        print("✅ Face recognition initialized (Coral + FaceNet)")
        return True
    except Exception as e:
        print(f"⚠️ Face recognition unavailable: {e}")
        import traceback
        traceback.print_exc()
        return False


def init_oak():
    """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3).

    Builds RGB camera + stereo depth + SpatialDetectionNetwork, starts the
    pipeline, and wires the module-level output queues. Returns True on
    success, False on any failure (traceback printed).
    """
    global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels
    try:
        print("🦊 Initializing OAK-D with SPATIAL yolov6-nano...")

        # Create pipeline
        pipeline = dai.Pipeline()

        # Create RGB camera node
        cam = pipeline.create(dai.node.Camera).build()
        # Request RGB output for snapshots (1080p)
        cam_out = cam.requestOutput((1920, 1080), dai.ImgFrame.Type.BGR888p)
        rgb_queue = cam_out.createOutputQueue(maxSize=1, blocking=False)

        # Create mono cameras for stereo depth
        monoLeft = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
        monoRight = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)

        # Create stereo depth node
        stereo = pipeline.create(dai.node.StereoDepth)

        # Link mono cameras to stereo
        monoLeftOut = monoLeft.requestFullResolutionOutput()
        monoRightOut = monoRight.requestFullResolutionOutput()
        monoLeftOut.link(stereo.left)
        monoRightOut.link(stereo.right)

        # Configure stereo
        stereo.setRectification(True)
        stereo.setLeftRightCheck(True)
        stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)  # Align depth to RGB

        # Create SPATIAL detection network
        desc = dai.NNModelDescription(DETECTION_MODEL)
        spatialDet = pipeline.create(dai.node.SpatialDetectionNetwork).build(cam, stereo, desc)
        spatialDet.setConfidenceThreshold(DETECTION_THRESHOLD)
        spatialDet.setDepthLowerThreshold(DEPTH_LOWER_THRESHOLD)
        spatialDet.setDepthUpperThreshold(DEPTH_UPPER_THRESHOLD)
        spatialDet.setBoundingBoxScaleFactor(0.5)

        # Get class labels (guard against an empty class list before indexing)
        labels = spatialDet.getClasses()
        person_label = labels[PERSON_CLASS_ID] if labels else "n/a"
        print(f"✅ Loaded {len(labels)} classes, person={person_label}")

        # Create detection output queue
        detection_queue = spatialDet.out.createOutputQueue(maxSize=1, blocking=False)

        # Create depth output queue for visualization (optional)
        depth_out = stereo.depth.createOutputQueue(maxSize=1, blocking=False)
        depth_queue = depth_out

        # Start pipeline
        pipeline.start()
        pipeline_ctx = pipeline
        print("✅ OAK-D initialized with SPATIAL person detection!")

        # Initialize pose estimator on Coral 2
        _init_pose_estimator()
        return True
    except Exception as e:
        print(f"❌ Failed to initialize OAK-D: {e}")
        import traceback
        traceback.print_exc()
        return False


def cleanup_oak():
    """Cleanup OAK-D resources: stop the detection loop, release the
    face recognizer, and stop/close the pipeline (best-effort)."""
    global pipeline_ctx, running, face_recognizer
    running = False
    if face_recognizer:
        face_recognizer.close()
        face_recognizer = None
    if pipeline_ctx:
        try:
            pipeline_ctx.stop()
            pipeline_ctx.close()
        except Exception:
            # Best-effort shutdown: the device may already be gone.
            # (Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
            # are not swallowed.)
            pass
        pipeline_ctx = None


def _init_pose_estimator():
    """Initialize MoveNet Lightning on the second Coral Edge TPU.

    Leaves ``pose_estimator`` as None if the model file is missing or the
    TPU cannot be opened.
    """
    global pose_estimator
    if not POSE_MODEL_PATH.exists():
        print(f"⚠️ Pose model not found: {POSE_MODEL_PATH}")
        return
    try:
        pose_estimator = PoseEstimator(
            model_path=str(POSE_MODEL_PATH),
            device_index=POSE_CORAL_DEVICE,
        )
        print("✅ Pose estimator initialized on Coral 2!")
    except Exception as e:
        print(f"⚠️ Pose estimator failed to initialize: {e}")
        pose_estimator = None


def _run_pose_estimation(rgb_frame):
    """Run pose estimation on an RGB frame via Coral 2.

    Updates ``pose_state`` in place; errors are printed and swallowed so the
    detection loop keeps running.
    """
    global pose_state
    if pose_estimator is None:
        return
    try:
        result = pose_estimator.estimate(rgb_frame)
        posture = pose_estimator.derive_posture(result["keypoints"])
        pose_state["active"] = True
        pose_state["keypoints"] = result["keypoints"]
        pose_state["posture"] = posture
        pose_state["num_valid"] = result["num_valid"]
        pose_state["mean_confidence"] = result["mean_confidence"]
        pose_state["inference_ms"] = result["inference_ms"]
        pose_state["last_update"] = result["timestamp"]
    except Exception as e:
        print(f"Pose estimation error: {e}")


def detection_loop():
    """Background thread for SPATIAL presence detection.

    Polls the spatial detection queue every DETECTION_INTERVAL seconds,
    updates ``presence_state`` (and, via helpers, ``pose_state``), and flips
    ``present`` to False after PRESENCE_TIMEOUT seconds without a person.
    """
    global running, presence_state, detection_queue
    print("🔍 SPATIAL presence detection loop started")
    while running:
        try:
            if detection_queue is None:
                time.sleep(1)
                continue
            data = detection_queue.tryGet()
            if data is not None:
                now = time.time()
                presence_state["last_detection"] = now

                # Filter for person detections only
                persons = [d for d in data.detections if d.label == PERSON_CLASS_ID]
                person_count = len(persons)
                presence_state["person_count"] = person_count

                if person_count > 0:
                    presence_state["present"] = True
                    presence_state["last_seen"] = now

                    # Get highest confidence detection
                    best = max(persons, key=lambda d: d.confidence)
                    presence_state["confidence"] = best.confidence

                    # Spatial data (z doubles as straight-line distance in mm)
                    presence_state["spatial_x"] = best.spatialCoordinates.x
                    presence_state["spatial_y"] = best.spatialCoordinates.y
                    presence_state["spatial_z"] = best.spatialCoordinates.z
                    presence_state["distance_mm"] = best.spatialCoordinates.z

                    # Grab RGB frame for face recognition + pose estimation
                    face_results = []
                    rgb_frame = None
                    if rgb_queue:
                        rgb_data = rgb_queue.tryGet()
                        if rgb_data is not None:
                            rgb_frame = rgb_data.getCvFrame()

                    # Face recognition
                    if face_recognizer and rgb_frame is not None:
                        try:
                            face_results = face_recognizer.process_frame(
                                rgb_frame, persons
                            )
                        except Exception as e:
                            logger.warning("Face recognition error: %s", e)

                    # Pose estimation (runs on Coral 2, parallel-safe)
                    if rgb_frame is not None:
                        _run_pose_estimation(rgb_frame)

                    det_list = []
                    best_recognized = None
                    best_recog_conf = 0.0
                    for i, d in enumerate(persons):
                        det = {
                            "xmin": d.xmin,
                            "ymin": d.ymin,
                            "xmax": d.xmax,
                            "ymax": d.ymax,
                            "confidence": d.confidence,
                            "x_mm": d.spatialCoordinates.x,
                            "y_mm": d.spatialCoordinates.y,
                            "z_mm": d.spatialCoordinates.z,
                            "distance_m": d.spatialCoordinates.z / 1000.0,
                            "recognized_name": None,
                            "recognition_confidence": None,
                        }
                        # face_results is index-aligned with persons
                        if i < len(face_results):
                            det["recognized_name"] = face_results[i]["recognized_name"]
                            det["recognition_confidence"] = face_results[i]["recognition_confidence"]
                            if det["recognized_name"] and (
                                det["recognition_confidence"] or 0
                            ) > best_recog_conf:
                                best_recognized = det["recognized_name"]
                                # BUG FIX: confidence may be None even when a
                                # name is set; keep best_recog_conf numeric so
                                # round() below cannot raise.
                                best_recog_conf = det["recognition_confidence"] or 0.0
                        det_list.append(det)

                    presence_state["detections"] = det_list
                    presence_state["recognized_name"] = best_recognized
                    presence_state["recognition_confidence"] = (
                        round(best_recog_conf, 3) if best_recognized else None
                    )
                else:
                    # No person in this packet: clear per-detection state.
                    presence_state["detections"] = []
                    presence_state["confidence"] = 0.0
                    presence_state["spatial_x"] = None
                    presence_state["spatial_y"] = None
                    presence_state["spatial_z"] = None
                    presence_state["distance_mm"] = None
                    presence_state["recognized_name"] = None
                    presence_state["recognition_confidence"] = None
                    # Clear pose when no person
                    if pose_state["active"]:
                        pose_state["active"] = False
                        pose_state["keypoints"] = []
                        pose_state["posture"] = {}
                        pose_state["num_valid"] = 0
                        pose_state["mean_confidence"] = 0.0

                # Check timeout: flip `present` after PRESENCE_TIMEOUT seconds
                if presence_state["last_seen"]:
                    if now - presence_state["last_seen"] > PRESENCE_TIMEOUT:
                        presence_state["present"] = False

            time.sleep(DETECTION_INTERVAL)
        except Exception as e:
            print(f"Detection loop error: {e}")
            time.sleep(1)
    print("🛑 SPATIAL presence detection loop stopped")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown: init hardware, spawn the detection thread,
    and clean everything up when the app exits."""
    global running, detection_thread
    print("🦊 Starting OAK-D SPATIAL Vision Service...")
    init_face_recognition()
    if init_oak():
        running = True
        detection_thread = threading.Thread(target=detection_loop, daemon=True)
        detection_thread.start()
        print("✅ Service ready!")
    else:
        print("⚠️ OAK-D not available")
    yield
    print("👋 Shutting down...")
    cleanup_oak()


app = FastAPI(
    title="OAK-D SPATIAL Vision Service",
    description="Vixy's eyes with SPATIAL presence detection + face recognition + pose estimation! 🦊👀📏",
    version="0.6.0",
    lifespan=lifespan
)


@app.get("/health")
async def health():
    """Health check."""
    return {
        "status": "healthy",
        "service": "oak-service",
        "version": "0.6.0",
        "oak_connected": pipeline_ctx is not None,
        "detection_model": DETECTION_MODEL,
        "spatial_enabled": True,
        "face_recognition_enabled": face_recognizer is not None,
        "pose_model_loaded": pose_estimator is not None,
        "timestamp": time.time()
    }


@app.get("/presence")
async def presence():
    """Get current presence state with SPATIAL data - is Foxy there and how far?"""
    distance_m = None
    if presence_state["distance_mm"] is not None:
        distance_m = presence_state["distance_mm"] / 1000.0
    return {
        "present": presence_state["present"],
        "person_count": presence_state["person_count"],
        "last_seen": presence_state["last_seen"],
        "seconds_since_seen": (
            time.time() - presence_state["last_seen"]
            if presence_state["last_seen"] else None
        ),
        "confidence": presence_state["confidence"],
        "distance_mm": presence_state["distance_mm"],
        "distance_m": distance_m,
        # BUG FIX: compare against None (was truthiness, which hid a valid
        # zero reading); now consistent with the distance_mm check above.
        "spatial": {
            "x_mm": presence_state["spatial_x"],
            "y_mm": presence_state["spatial_y"],
            "z_mm": presence_state["spatial_z"],
        } if presence_state["spatial_z"] is not None else None,
        "recognized_name": presence_state["recognized_name"],
        "recognition_confidence": presence_state["recognition_confidence"],
        "timestamp": time.time()
    }


@app.get("/detections")
async def detections():
    """Get detailed detection results with SPATIAL coordinates."""
    return {
        "person_count": presence_state["person_count"],
        "detections": presence_state["detections"],
        "last_detection": presence_state["last_detection"],
        "timestamp": time.time()
    }


@app.get("/snapshot")
async def snapshot():
    """Capture RGB frame and return it as JPEG (quality 85)."""
    global rgb_queue
    if rgb_queue is None:
        raise HTTPException(status_code=503, detail="OAK-D not initialized")
    try:
        frame = rgb_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No frame available")
        img = frame.getCvFrame()
        _, jpeg = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 85])
        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/depth")
async def depth_frame():
    """Capture colorized depth frame (JET colormap) as JPEG."""
    global depth_queue
    if depth_queue is None:
        raise HTTPException(status_code=503, detail="Depth not available")
    try:
        frame = depth_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No depth frame available")
        depth_data = frame.getFrame()
        # Normalize and colorize
        depth_normalized = cv2.normalize(depth_data, None, 0, 255, cv2.NORM_MINMAX)
        depth_colored = cv2.applyColorMap(depth_normalized.astype(np.uint8), cv2.COLORMAP_JET)
        _, jpeg = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ============== Pose Estimation API ==============

@app.get("/pose")
async def pose():
    """Get current pose keypoints."""
    if pose_estimator is None:
        raise HTTPException(status_code=503, detail="Pose estimator not available")
    return {
        "active": pose_state["active"],
        "keypoints": pose_state["keypoints"],
        "num_valid": pose_state["num_valid"],
        "mean_confidence": pose_state["mean_confidence"],
        "inference_ms": pose_state["inference_ms"],
        "last_update": pose_state["last_update"],
        "timestamp": time.time(),
    }


@app.get("/pose/summary")
async def pose_summary():
    """Get derived posture summary."""
    if pose_estimator is None:
        raise HTTPException(status_code=503, detail="Pose estimator not available")
    return {
        "active": pose_state["active"],
        "posture": pose_state["posture"].get("posture", "unknown"),
        "facing_camera": pose_state["posture"].get("facing_camera", False),
        "arms_raised": pose_state["posture"].get("arms_raised", False),
        "mean_confidence": pose_state["mean_confidence"],
        "num_valid": pose_state["num_valid"],
        "timestamp": time.time(),
    }


# ============== Face Enrollment API ==============

@app.post("/faces/enroll")
async def enroll_face_upload(
    name: str = Form(...),
    photo: UploadFile = File(...),
):
    """Enroll a face by uploading a photo (multipart form: name + photo)."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    contents = await photo.read()
    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        raise HTTPException(status_code=400, detail="Could not decode image")
    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/faces/enroll-from-camera")
async def enroll_face_camera(name: str):
    """Enroll a face using the current camera frame. Pass name as query param."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    if rgb_queue is None:
        raise HTTPException(status_code=503, detail="Camera not available")
    frame_data = rgb_queue.tryGet()
    if frame_data is None:
        raise HTTPException(status_code=503, detail="No frame available")
    image = frame_data.getCvFrame()
    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.get("/faces")
async def list_faces():
    """List enrolled faces."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    return {"faces": face_recognizer.list_faces()}


@app.delete("/faces/{name}")
async def delete_face(name: str):
    """Remove all embeddings for a person."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    result = face_recognizer.delete_face(name)
    if not result["success"]:
        raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
    return result


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8100)