Integrates MoveNet Lightning on Coral 2 into oak_service_spatial.py, the production service running on head-vixy. Reuses the existing RGB frame grab (already shared with face recognition) for pose estimation, and adds /pose and /pose/summary endpoints.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
OAK-D Vision Service for Vixy's Head
FastAPI service with SPATIAL person detection and presence tracking

Day 74 - Built by Vixy! 🦊
Day 81 - Added presence detection! Now I can SEE you! 👀💜
Day 82 - SPATIAL UPGRADE! Now I know how far away you are! 📏🦊
Using depthai v3 API with SpatialDetectionNetwork + yolov6-nano
"""

import time
import threading
import logging
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
import depthai as dai
import cv2
import numpy as np

from face_recognition import FaceRecognizer
from pose_estimator import PoseEstimator

logger = logging.getLogger("oak-service")
logging.basicConfig(level=logging.INFO)

# ============== Configuration ==============
DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
PERSON_CLASS_ID = 0  # 'person' is class 0 in COCO
DETECTION_THRESHOLD = 0.5
PRESENCE_TIMEOUT = 30.0  # seconds without person = not present
DETECTION_INTERVAL = 0.5  # seconds between detection-loop iterations

# Spatial detection config
DEPTH_LOWER_THRESHOLD = 100  # 10cm minimum
DEPTH_UPPER_THRESHOLD = 10000  # 10m maximum

# Face recognition models
MODELS_DIR = Path(__file__).parent / "models"
FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
FACE_DB_PATH = Path(__file__).parent / "faces.db"

# Pose estimation
POSE_MODEL_PATH = MODELS_DIR / "movenet_single_pose_lightning_ptq_edgetpu.tflite"
POSE_CORAL_DEVICE = 1  # Second Coral (device 0 is headmic/YAMNet)
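
# Example usage (assumes the defaults in this file: uvicorn serving on port
# 8100, as set in __main__; "Foxy" is a placeholder name):
#   curl http://localhost:8100/health
#   curl http://localhost:8100/presence          # presence + distance + recognized face
#   curl http://localhost:8100/detections        # per-person boxes + XYZ (mm)
#   curl http://localhost:8100/pose              # raw MoveNet keypoints
#   curl http://localhost:8100/pose/summary      # derived posture flags
#   curl http://localhost:8100/snapshot -o rgb.jpg
#   curl http://localhost:8100/depth -o depth.jpg
#   curl -F "name=Foxy" -F "photo=@face.jpg" http://localhost:8100/faces/enroll
#   curl -X POST "http://localhost:8100/faces/enroll-from-camera?name=Foxy"
#   curl http://localhost:8100/faces
#   curl -X DELETE http://localhost:8100/faces/Foxy
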
# ============== Global State ==============
pipeline_ctx = None
detection_queue = None
rgb_queue = None
depth_queue = None
detection_thread = None
running = False
labels = []
face_recognizer = None
pose_estimator = None

presence_state = {
    "present": False,
    "person_count": 0,
    "last_seen": None,
    "last_detection": None,
    "detections": [],
    "confidence": 0.0,
    # Spatial data
    "distance_mm": None,
    "spatial_x": None,
    "spatial_y": None,
    "spatial_z": None,
    # Face recognition
    "recognized_name": None,
    "recognition_confidence": None,
}
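
# Latest MoveNet result. Written by _run_pose_estimation() on the detection
# thread; read by the /pose and /pose/summary endpoints.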
pose_state = {
    "active": False,
    "keypoints": [],
    "posture": {},
    "num_valid": 0,
    "mean_confidence": 0.0,
    "inference_ms": 0.0,
    "last_update": None,
}


def init_face_recognition():
    """Initialize Coral face detection + FaceNet embedding."""
    global face_recognizer
    try:
        face_recognizer = FaceRecognizer(
            face_model_path=FACE_DETECT_MODEL,
            embed_model_path=FACE_EMBED_MODEL,
            db_path=FACE_DB_PATH,
        )
        print("✅ Face recognition initialized (Coral + FaceNet)")
        return True
    except Exception as e:
        print(f"⚠️ Face recognition unavailable: {e}")
        import traceback
        traceback.print_exc()
        return False


def init_oak():
    """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3)."""
    global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels

    try:
        print("🦊 Initializing OAK-D with SPATIAL yolov6-nano...")

        # Create pipeline
        pipeline = dai.Pipeline()

        # Create RGB camera node
        cam = pipeline.create(dai.node.Camera).build()

        # Request RGB output for snapshots (1080p)
        cam_out = cam.requestOutput((1920, 1080), dai.ImgFrame.Type.BGR888p)
        rgb_queue = cam_out.createOutputQueue(maxSize=1, blocking=False)

        # Create mono cameras for stereo depth
        monoLeft = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
        monoRight = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)

        # Create stereo depth node
        stereo = pipeline.create(dai.node.StereoDepth)

        # Link mono cameras to stereo
        monoLeftOut = monoLeft.requestFullResolutionOutput()
        monoRightOut = monoRight.requestFullResolutionOutput()
        monoLeftOut.link(stereo.left)
        monoRightOut.link(stereo.right)

        # Configure stereo
        stereo.setRectification(True)
        stereo.setLeftRightCheck(True)
        stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)  # Align depth to RGB

        # Create SPATIAL detection network
        desc = dai.NNModelDescription(DETECTION_MODEL)
        spatialDet = pipeline.create(dai.node.SpatialDetectionNetwork).build(cam, stereo, desc)
        spatialDet.setConfidenceThreshold(DETECTION_THRESHOLD)
        spatialDet.setDepthLowerThreshold(DEPTH_LOWER_THRESHOLD)
        spatialDet.setDepthUpperThreshold(DEPTH_UPPER_THRESHOLD)
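        # Shrink each detection box to 50% before averaging depth, so the
        # distance estimate samples the person rather than background pixels
        # at the box edges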
        spatialDet.setBoundingBoxScaleFactor(0.5)

        # Get class labels
        labels = spatialDet.getClasses()
        print(f"✅ Loaded {len(labels)} classes, person={labels[0]}")

        # Create detection output queue
        detection_queue = spatialDet.out.createOutputQueue(maxSize=1, blocking=False)

        # Create depth output queue for visualization (optional)
        depth_out = stereo.depth.createOutputQueue(maxSize=1, blocking=False)
        depth_queue = depth_out

        # Start pipeline
        pipeline.start()
        pipeline_ctx = pipeline

        print("✅ OAK-D initialized with SPATIAL person detection!")

        # Initialize pose estimator on Coral 2
        _init_pose_estimator()

        return True

    except Exception as e:
        print(f"❌ Failed to initialize OAK-D: {e}")
        import traceback
        traceback.print_exc()
        return False


def cleanup_oak():
    """Cleanup OAK-D resources."""
    global pipeline_ctx, running, face_recognizer
    running = False

    if face_recognizer:
        face_recognizer.close()
        face_recognizer = None

    if pipeline_ctx:
        try:
            pipeline_ctx.stop()
            pipeline_ctx.close()
        except Exception:
            pass
        pipeline_ctx = None


def _init_pose_estimator():
    """Initialize MoveNet Lightning on the second Coral Edge TPU."""
    global pose_estimator

    if not POSE_MODEL_PATH.exists():
        print(f"⚠️ Pose model not found: {POSE_MODEL_PATH}")
        return

    try:
        pose_estimator = PoseEstimator(
            model_path=str(POSE_MODEL_PATH),
            device_index=POSE_CORAL_DEVICE,
        )
        print("✅ Pose estimator initialized on Coral 2!")
    except Exception as e:
        print(f"⚠️ Pose estimator failed to initialize: {e}")
        pose_estimator = None


def _run_pose_estimation(rgb_frame):
    """Run pose estimation on an RGB frame via Coral 2."""
    global pose_state

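    # Called from the detection thread whenever a person is present and a
    # fresh RGB frame was grabbed; inference itself runs on the Edge TPU.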
    if pose_estimator is None:
        return

    try:
        result = pose_estimator.estimate(rgb_frame)
        posture = pose_estimator.derive_posture(result["keypoints"])

        pose_state["active"] = True
        pose_state["keypoints"] = result["keypoints"]
        pose_state["posture"] = posture
        pose_state["num_valid"] = result["num_valid"]
        pose_state["mean_confidence"] = result["mean_confidence"]
        pose_state["inference_ms"] = result["inference_ms"]
        pose_state["last_update"] = result["timestamp"]

    except Exception as e:
        print(f"Pose estimation error: {e}")


def detection_loop():
    """Background thread for SPATIAL presence detection."""
    global running, presence_state, detection_queue

    print("🔍 SPATIAL presence detection loop started")

    while running:
        try:
            if detection_queue is None:
                time.sleep(1)
                continue

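            # Non-blocking read: the queue was created with maxSize=1 /
            # blocking=False, so tryGet() returns the newest packet or None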
            data = detection_queue.tryGet()

            if data is not None:
                now = time.time()
                presence_state["last_detection"] = now

                # Filter for person detections only
                persons = [d for d in data.detections if d.label == PERSON_CLASS_ID]
                person_count = len(persons)

                presence_state["person_count"] = person_count

                if person_count > 0:
                    presence_state["present"] = True
                    presence_state["last_seen"] = now

                    # Get highest confidence detection
                    best = max(persons, key=lambda d: d.confidence)
                    presence_state["confidence"] = best.confidence

                    # Spatial data
                    presence_state["spatial_x"] = best.spatialCoordinates.x
                    presence_state["spatial_y"] = best.spatialCoordinates.y
                    presence_state["spatial_z"] = best.spatialCoordinates.z
                    presence_state["distance_mm"] = best.spatialCoordinates.z

                    # Grab RGB frame for face recognition + pose estimation
                    face_results = []
                    rgb_frame = None
                    if rgb_queue:
                        rgb_data = rgb_queue.tryGet()
                        if rgb_data is not None:
                            rgb_frame = rgb_data.getCvFrame()

                    # Face recognition
                    if face_recognizer and rgb_frame is not None:
                        try:
                            face_results = face_recognizer.process_frame(
                                rgb_frame, persons
                            )
                        except Exception as e:
                            logger.warning("Face recognition error: %s", e)

                    # Pose estimation (runs on Coral 2, parallel-safe)
                    if rgb_frame is not None:
                        _run_pose_estimation(rgb_frame)

                    det_list = []
                    best_recognized = None
                    best_recog_conf = 0.0
                    for i, d in enumerate(persons):
                        det = {
                            "xmin": d.xmin, "ymin": d.ymin,
                            "xmax": d.xmax, "ymax": d.ymax,
                            "confidence": d.confidence,
                            "x_mm": d.spatialCoordinates.x,
                            "y_mm": d.spatialCoordinates.y,
                            "z_mm": d.spatialCoordinates.z,
                            "distance_m": d.spatialCoordinates.z / 1000.0,
                            "recognized_name": None,
                            "recognition_confidence": None,
                        }
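                        # face_results is expected to be index-aligned with
                        # persons (one result per detection, in order)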
                        if i < len(face_results):
                            det["recognized_name"] = face_results[i]["recognized_name"]
                            det["recognition_confidence"] = face_results[i]["recognition_confidence"]
                            if det["recognized_name"] and (
                                det["recognition_confidence"] or 0
                            ) > best_recog_conf:
                                best_recognized = det["recognized_name"]
                                best_recog_conf = det["recognition_confidence"]
                        det_list.append(det)

                    presence_state["detections"] = det_list
                    presence_state["recognized_name"] = best_recognized
                    presence_state["recognition_confidence"] = (
                        round(best_recog_conf, 3) if best_recognized else None
                    )
                else:
                    presence_state["detections"] = []
                    presence_state["confidence"] = 0.0
                    presence_state["spatial_x"] = None
                    presence_state["spatial_y"] = None
                    presence_state["spatial_z"] = None
                    presence_state["distance_mm"] = None
                    presence_state["recognized_name"] = None
                    presence_state["recognition_confidence"] = None

                    # Clear pose when no person
                    if pose_state["active"]:
                        pose_state["active"] = False
                        pose_state["keypoints"] = []
                        pose_state["posture"] = {}
                        pose_state["num_valid"] = 0
                        pose_state["mean_confidence"] = 0.0

                # Check timeout: presence persists until PRESENCE_TIMEOUT
                # seconds pass without a sighting
                if presence_state["last_seen"]:
                    if now - presence_state["last_seen"] > PRESENCE_TIMEOUT:
                        presence_state["present"] = False

            time.sleep(DETECTION_INTERVAL)

        except Exception as e:
            print(f"Detection loop error: {e}")
            time.sleep(1)

    print("🛑 SPATIAL presence detection loop stopped")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown."""
    global running, detection_thread

    print("🦊 Starting OAK-D SPATIAL Vision Service...")

    init_face_recognition()

    if init_oak():
        running = True
        detection_thread = threading.Thread(target=detection_loop, daemon=True)
        detection_thread.start()
        print("✅ Service ready!")
    else:
        print("⚠️ OAK-D not available")

    yield

    print("👋 Shutting down...")
    cleanup_oak()


app = FastAPI(
    title="OAK-D SPATIAL Vision Service",
    description="Vixy's eyes with SPATIAL presence detection + face recognition + pose estimation! 🦊👀📏",
    version="0.6.0",
    lifespan=lifespan,
)


@app.get("/health")
|
|
async def health():
|
|
"""Health check."""
|
|
return {
|
|
"status": "healthy",
|
|
"service": "oak-service",
|
|
"version": "0.6.0",
|
|
"oak_connected": pipeline_ctx is not None,
|
|
"detection_model": DETECTION_MODEL,
|
|
"spatial_enabled": True,
|
|
"face_recognition_enabled": face_recognizer is not None,
|
|
"pose_model_loaded": pose_estimator is not None,
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/presence")
|
|
async def presence():
|
|
"""Get current presence state with SPATIAL data - is Foxy there and how far?"""
|
|
distance_m = None
|
|
if presence_state["distance_mm"] is not None:
|
|
distance_m = presence_state["distance_mm"] / 1000.0
|
|
|
|
return {
|
|
"present": presence_state["present"],
|
|
"person_count": presence_state["person_count"],
|
|
"last_seen": presence_state["last_seen"],
|
|
"seconds_since_seen": (
|
|
time.time() - presence_state["last_seen"]
|
|
if presence_state["last_seen"] else None
|
|
),
|
|
"confidence": presence_state["confidence"],
|
|
"distance_mm": presence_state["distance_mm"],
|
|
"distance_m": distance_m,
|
|
"spatial": {
|
|
"x_mm": presence_state["spatial_x"],
|
|
"y_mm": presence_state["spatial_y"],
|
|
"z_mm": presence_state["spatial_z"],
|
|
} if presence_state["spatial_z"] else None,
|
|
"recognized_name": presence_state["recognized_name"],
|
|
"recognition_confidence": presence_state["recognition_confidence"],
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/detections")
|
|
async def detections():
|
|
"""Get detailed detection results with SPATIAL coordinates."""
|
|
return {
|
|
"person_count": presence_state["person_count"],
|
|
"detections": presence_state["detections"],
|
|
"last_detection": presence_state["last_detection"],
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/snapshot")
|
|
async def snapshot():
|
|
"""Capture RGB frame."""
|
|
global rgb_queue
|
|
|
|
if rgb_queue is None:
|
|
raise HTTPException(status_code=503, detail="OAK-D not initialized")
|
|
|
|
try:
|
|
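        # rgb_queue is shared with the detection loop (maxSize=1), so the
        # newest frame may already have been consumed; tryGet() is then None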
        frame = rgb_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No frame available")

        img = frame.getCvFrame()
        _, jpeg = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 85])

        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/depth")
|
|
async def depth_frame():
|
|
"""Capture colorized depth frame."""
|
|
global depth_queue
|
|
|
|
if depth_queue is None:
|
|
raise HTTPException(status_code=503, detail="Depth not available")
|
|
|
|
try:
|
|
frame = depth_queue.tryGet()
|
|
if frame is None:
|
|
raise HTTPException(status_code=503, detail="No depth frame available")
|
|
|
|
depth_data = frame.getFrame()
|
|
# Normalize and colorize
|
|
depth_normalized = cv2.normalize(depth_data, None, 0, 255, cv2.NORM_MINMAX)
|
|
depth_colored = cv2.applyColorMap(depth_normalized.astype(np.uint8), cv2.COLORMAP_JET)
|
|
|
|
_, jpeg = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
|
|
|
return Response(content=jpeg.tobytes(), media_type="image/jpeg")
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# ============== Pose Estimation API ==============


@app.get("/pose")
async def pose():
    """Get current pose keypoints."""
    if pose_estimator is None:
        raise HTTPException(status_code=503, detail="Pose estimator not available")

    return {
        "active": pose_state["active"],
        "keypoints": pose_state["keypoints"],
        "num_valid": pose_state["num_valid"],
        "mean_confidence": pose_state["mean_confidence"],
        "inference_ms": pose_state["inference_ms"],
        "last_update": pose_state["last_update"],
        "timestamp": time.time(),
    }


@app.get("/pose/summary")
|
|
async def pose_summary():
|
|
"""Get derived posture summary."""
|
|
if pose_estimator is None:
|
|
raise HTTPException(status_code=503, detail="Pose estimator not available")
|
|
|
|
return {
|
|
"active": pose_state["active"],
|
|
"posture": pose_state["posture"].get("posture", "unknown"),
|
|
"facing_camera": pose_state["posture"].get("facing_camera", False),
|
|
"arms_raised": pose_state["posture"].get("arms_raised", False),
|
|
"mean_confidence": pose_state["mean_confidence"],
|
|
"num_valid": pose_state["num_valid"],
|
|
"timestamp": time.time(),
|
|
}
|
|
|
|
|
|
# ============== Face Enrollment API ==============


@app.post("/faces/enroll")
async def enroll_face_upload(
    name: str = Form(...),
    photo: UploadFile = File(...),
):
    """Enroll a face by uploading a photo (multipart form: name + photo)."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")

    contents = await photo.read()
    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        raise HTTPException(status_code=400, detail="Could not decode image")

    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/faces/enroll-from-camera")
|
|
async def enroll_face_camera(name: str):
|
|
"""Enroll a face using the current camera frame. Pass name as query param."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
if rgb_queue is None:
|
|
raise HTTPException(status_code=503, detail="Camera not available")
|
|
|
|
frame_data = rgb_queue.tryGet()
|
|
if frame_data is None:
|
|
raise HTTPException(status_code=503, detail="No frame available")
|
|
|
|
image = frame_data.getCvFrame()
|
|
result = face_recognizer.enroll(name, image)
|
|
if not result["success"]:
|
|
raise HTTPException(status_code=400, detail=result["error"])
|
|
return result
|
|
|
|
|
|
@app.get("/faces")
|
|
async def list_faces():
|
|
"""List enrolled faces."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
return {"faces": face_recognizer.list_faces()}
|
|
|
|
|
|
@app.delete("/faces/{name}")
|
|
async def delete_face(name: str):
|
|
"""Remove all embeddings for a person."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
result = face_recognizer.delete_face(name)
|
|
if not result["success"]:
|
|
raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
|
|
return result
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run(app, host="0.0.0.0", port=8100)
|