#!/usr/bin/env python3
"""
OAK-D Vision Service for Vixy's Head
FastAPI service with SPATIAL person detection and presence tracking
Day 74 - Built by Vixy! 🦊
Day 81 - Added presence detection! Now I can SEE you! 👀💜
Day 82 - SPATIAL UPGRADE! Now I know how far away you are! 📏🦊
Using depthai v3 API with SpatialDetectionNetwork + yolov6-nano
"""
import time
import threading
import logging
from pathlib import Path
from contextlib import asynccontextmanager

from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
import depthai as dai
import cv2
import numpy as np

from face_recognition import FaceRecognizer

logger = logging.getLogger("oak-service")
logging.basicConfig(level=logging.INFO)

# ============== Configuration ==============
DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
PERSON_CLASS_ID = 0              # 'person' is class 0 in COCO
DETECTION_THRESHOLD = 0.5
PRESENCE_TIMEOUT = 30.0          # seconds without person = not present
DETECTION_INTERVAL = 0.5         # seconds between detection-queue polls

# Spatial detection config (valid depth window for spatial coordinates)
DEPTH_LOWER_THRESHOLD = 100      # 10cm minimum
DEPTH_UPPER_THRESHOLD = 10000    # 10m maximum

# Face recognition models (Coral EdgeTPU detector + FaceNet embedder)
MODELS_DIR = Path(__file__).parent / "models"
FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
FACE_DB_PATH = Path(__file__).parent / "faces.db"

# ============== Global State ==============
pipeline_ctx = None        # running dai.Pipeline, or None when camera is down
detection_queue = None     # output queue of SpatialDetectionNetwork
rgb_queue = None           # 1080p BGR frames for snapshots / face recognition
depth_queue = None         # raw depth frames for the /depth endpoint
detection_thread = None    # background thread running detection_loop()
running = False            # loop flag; cleared on shutdown
labels = []                # class labels reported by the detection network
face_recognizer = None     # FaceRecognizer instance, or None if unavailable

# Shared between the detection thread and async handlers.  Handlers only
# read individual keys and the thread only assigns whole values, so no
# lock is taken; a reader may see a mix of two consecutive frames at worst.
presence_state = {
    "present": False,
    "person_count": 0,
    "last_seen": None,
    "last_detection": None,
    "detections": [],
    "confidence": 0.0,
    # Spatial data (millimetres, camera-centred; z is distance)
    "distance_mm": None,
    "spatial_x": None,
    "spatial_y": None,
    "spatial_z": None,
    # Face recognition
    "recognized_name": None,
    "recognition_confidence": None,
}


def init_face_recognition():
    """Initialize Coral face detection + FaceNet embedding.

    Returns True on success; on any failure logs the traceback and
    returns False, leaving ``face_recognizer`` as None (service still
    works without face recognition).
    """
    global face_recognizer
    try:
        face_recognizer = FaceRecognizer(
            face_model_path=FACE_DETECT_MODEL,
            embed_model_path=FACE_EMBED_MODEL,
            db_path=FACE_DB_PATH,
        )
        logger.info("✅ Face recognition initialized (Coral + FaceNet)")
        return True
    except Exception as e:
        # logger.exception includes the traceback the old print_exc() gave us
        logger.exception("⚠️ Face recognition unavailable: %s", e)
        return False


def init_oak():
    """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3).

    Builds RGB camera + stereo depth + SpatialDetectionNetwork, starts the
    pipeline, and populates the module-level queues.  Returns True on
    success, False if the device/pipeline could not be brought up.
    """
    global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels
    try:
        logger.info("🦊 Initializing OAK-D with SPATIAL yolov6-nano...")

        pipeline = dai.Pipeline()

        # RGB camera; 1080p output is used for snapshots and face recognition
        cam = pipeline.create(dai.node.Camera).build()
        cam_out = cam.requestOutput((1920, 1080), dai.ImgFrame.Type.BGR888p)
        rgb_queue = cam_out.createOutputQueue(maxSize=1, blocking=False)

        # Mono cameras feeding stereo depth
        monoLeft = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
        monoRight = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)

        stereo = pipeline.create(dai.node.StereoDepth)
        monoLeftOut = monoLeft.requestFullResolutionOutput()
        monoRightOut = monoRight.requestFullResolutionOutput()
        monoLeftOut.link(stereo.left)
        monoRightOut.link(stereo.right)

        stereo.setRectification(True)
        stereo.setLeftRightCheck(True)
        stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)  # Align depth to RGB

        # Spatial detection network: fuses detections with depth to give
        # per-detection XYZ coordinates in millimetres.
        desc = dai.NNModelDescription(DETECTION_MODEL)
        spatialDet = pipeline.create(dai.node.SpatialDetectionNetwork).build(cam, stereo, desc)
        spatialDet.setConfidenceThreshold(DETECTION_THRESHOLD)
        spatialDet.setDepthLowerThreshold(DEPTH_LOWER_THRESHOLD)
        spatialDet.setDepthUpperThreshold(DEPTH_UPPER_THRESHOLD)
        spatialDet.setBoundingBoxScaleFactor(0.5)

        labels = spatialDet.getClasses()
        # Guard the index: an empty label list should not abort init here.
        person_label = labels[PERSON_CLASS_ID] if labels else "?"
        logger.info("✅ Loaded %d classes, person=%s", len(labels), person_label)

        detection_queue = spatialDet.out.createOutputQueue(maxSize=1, blocking=False)

        # Depth output queue for visualization (optional)
        depth_queue = stereo.depth.createOutputQueue(maxSize=1, blocking=False)

        pipeline.start()
        pipeline_ctx = pipeline

        logger.info("✅ OAK-D initialized with SPATIAL person detection!")
        return True
    except Exception as e:
        logger.exception("❌ Failed to initialize OAK-D: %s", e)
        return False


def cleanup_oak():
    """Cleanup OAK-D resources and stop the detection loop (best effort)."""
    global pipeline_ctx, running, face_recognizer
    running = False
    if face_recognizer:
        face_recognizer.close()
        face_recognizer = None
    if pipeline_ctx:
        try:
            pipeline_ctx.stop()
            pipeline_ctx.close()
        except Exception:
            # Bare except would also swallow KeyboardInterrupt/SystemExit;
            # shutdown is best-effort, so only Exception is ignored.
            pass
        pipeline_ctx = None


def _clear_presence():
    """Reset per-frame detection fields when no person is in view."""
    presence_state["detections"] = []
    presence_state["confidence"] = 0.0
    presence_state["spatial_x"] = None
    presence_state["spatial_y"] = None
    presence_state["spatial_z"] = None
    presence_state["distance_mm"] = None
    presence_state["recognized_name"] = None
    presence_state["recognition_confidence"] = None


def _run_face_recognition(persons):
    """Best-effort face recognition on the latest RGB frame.

    Returns a list of per-person results (parallel to ``persons``, but may
    be shorter or empty when no frame is available or recognition fails).
    """
    if face_recognizer is None or rgb_queue is None:
        return []
    rgb_data = rgb_queue.tryGet()
    if rgb_data is None:
        return []
    try:
        return face_recognizer.process_frame(rgb_data.getCvFrame(), persons)
    except Exception as e:
        logger.warning("Face recognition error: %s", e)
        return []


def _update_presence(persons, now):
    """Update presence_state from a non-empty list of person detections."""
    presence_state["present"] = True
    presence_state["last_seen"] = now

    # Highest-confidence person drives the headline spatial fields
    best = max(persons, key=lambda d: d.confidence)
    presence_state["confidence"] = best.confidence
    presence_state["spatial_x"] = best.spatialCoordinates.x
    presence_state["spatial_y"] = best.spatialCoordinates.y
    presence_state["spatial_z"] = best.spatialCoordinates.z
    presence_state["distance_mm"] = best.spatialCoordinates.z

    face_results = _run_face_recognition(persons)

    det_list = []
    best_recognized = None
    best_recog_conf = 0.0
    for i, d in enumerate(persons):
        det = {
            "xmin": d.xmin,
            "ymin": d.ymin,
            "xmax": d.xmax,
            "ymax": d.ymax,
            "confidence": d.confidence,
            "x_mm": d.spatialCoordinates.x,
            "y_mm": d.spatialCoordinates.y,
            "z_mm": d.spatialCoordinates.z,
            "distance_m": d.spatialCoordinates.z / 1000.0,
            "recognized_name": None,
            "recognition_confidence": None,
        }
        if i < len(face_results):
            det["recognized_name"] = face_results[i]["recognized_name"]
            det["recognition_confidence"] = face_results[i]["recognition_confidence"]
            if det["recognized_name"] and (
                det["recognition_confidence"] or 0
            ) > best_recog_conf:
                best_recognized = det["recognized_name"]
                best_recog_conf = det["recognition_confidence"]
        det_list.append(det)

    presence_state["detections"] = det_list
    presence_state["recognized_name"] = best_recognized
    presence_state["recognition_confidence"] = (
        round(best_recog_conf, 3) if best_recognized else None
    )


def detection_loop():
    """Background thread: poll the spatial detection queue and keep
    presence_state current.  Runs until ``running`` is cleared."""
    global running
    logger.info("🔍 SPATIAL presence detection loop started")
    while running:
        try:
            if detection_queue is None:
                time.sleep(1)
                continue

            data = detection_queue.tryGet()
            now = time.time()
            if data is not None:
                presence_state["last_detection"] = now
                persons = [d for d in data.detections if d.label == PERSON_CLASS_ID]
                presence_state["person_count"] = len(persons)
                if persons:
                    _update_presence(persons, now)
                else:
                    _clear_presence()

            # Fix: run the timeout check every iteration, not only when a
            # packet arrives — otherwise a stalled camera left "present"
            # stuck at True forever.
            if presence_state["last_seen"] is not None:
                if now - presence_state["last_seen"] > PRESENCE_TIMEOUT:
                    presence_state["present"] = False

            time.sleep(DETECTION_INTERVAL)
        except Exception as e:
            logger.warning("Detection loop error: %s", e)
            time.sleep(1)
    logger.info("🛑 SPATIAL presence detection loop stopped")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown: bring up the camera and the detection thread."""
    global running, detection_thread
    logger.info("🦊 Starting OAK-D SPATIAL Vision Service...")
    init_face_recognition()
    if init_oak():
        running = True
        detection_thread = threading.Thread(target=detection_loop, daemon=True)
        detection_thread.start()
        logger.info("✅ Service ready!")
    else:
        logger.warning("⚠️ OAK-D not available")
    yield
    logger.info("👋 Shutting down...")
    cleanup_oak()


app = FastAPI(
    title="OAK-D SPATIAL Vision Service",
    description="Vixy's eyes with SPATIAL presence detection + face recognition! 🦊👀📏",
    version="0.5.0",
    lifespan=lifespan,
)


@app.get("/health")
async def health():
    """Health check."""
    return {
        "status": "healthy",
        "service": "oak-service",
        "version": "0.5.0",
        "oak_connected": pipeline_ctx is not None,
        "detection_model": DETECTION_MODEL,
        "spatial_enabled": True,
        "face_recognition_enabled": face_recognizer is not None,
        "timestamp": time.time(),
    }


@app.get("/presence")
async def presence():
    """Get current presence state with SPATIAL data - is Foxy there and how far?"""
    distance_m = None
    if presence_state["distance_mm"] is not None:
        distance_m = presence_state["distance_mm"] / 1000.0
    return {
        "present": presence_state["present"],
        "person_count": presence_state["person_count"],
        "last_seen": presence_state["last_seen"],
        "seconds_since_seen": (
            time.time() - presence_state["last_seen"]
            if presence_state["last_seen"]
            else None
        ),
        "confidence": presence_state["confidence"],
        "distance_mm": presence_state["distance_mm"],
        "distance_m": distance_m,
        # Fix: test against None, not truthiness — a valid 0.0 mm depth
        # reading must not collapse the whole spatial block to null.
        "spatial": {
            "x_mm": presence_state["spatial_x"],
            "y_mm": presence_state["spatial_y"],
            "z_mm": presence_state["spatial_z"],
        } if presence_state["spatial_z"] is not None else None,
        "recognized_name": presence_state["recognized_name"],
        "recognition_confidence": presence_state["recognition_confidence"],
        "timestamp": time.time(),
    }


@app.get("/detections")
async def detections():
    """Get detailed detection results with SPATIAL coordinates."""
    return {
        "person_count": presence_state["person_count"],
        "detections": presence_state["detections"],
        "last_detection": presence_state["last_detection"],
        "timestamp": time.time(),
    }


@app.get("/snapshot")
async def snapshot():
    """Capture RGB frame as JPEG."""
    if rgb_queue is None:
        raise HTTPException(status_code=503, detail="OAK-D not initialized")
    try:
        frame = rgb_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No frame available")
        img = frame.getCvFrame()
        _, jpeg = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 85])
        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/depth")
async def depth_frame():
    """Capture colorized depth frame as JPEG (JET colormap)."""
    if depth_queue is None:
        raise HTTPException(status_code=503, detail="Depth not available")
    try:
        frame = depth_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No depth frame available")
        depth_data = frame.getFrame()
        # Normalize to 0-255 then colorize for human viewing
        depth_normalized = cv2.normalize(depth_data, None, 0, 255, cv2.NORM_MINMAX)
        depth_colored = cv2.applyColorMap(depth_normalized.astype(np.uint8), cv2.COLORMAP_JET)
        _, jpeg = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


# ============== Face Enrollment API ==============

@app.post("/faces/enroll")
async def enroll_face_upload(
    name: str = Form(...),
    photo: UploadFile = File(...),
):
    """Enroll a face by uploading a photo (multipart form: name + photo)."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    contents = await photo.read()
    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        raise HTTPException(status_code=400, detail="Could not decode image")
    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/faces/enroll-from-camera")
async def enroll_face_camera(name: str):
    """Enroll a face using the current camera frame. Pass name as query param."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    if rgb_queue is None:
        raise HTTPException(status_code=503, detail="Camera not available")
    frame_data = rgb_queue.tryGet()
    if frame_data is None:
        raise HTTPException(status_code=503, detail="No frame available")
    image = frame_data.getCvFrame()
    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.get("/faces")
async def list_faces():
    """List enrolled faces."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    return {"faces": face_recognizer.list_faces()}


@app.delete("/faces/{name}")
async def delete_face(name: str):
    """Remove all embeddings for a person."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")
    result = face_recognizer.delete_face(name)
    if not result["success"]:
        raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
    return result


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8100)