Integrates MoveNet Lightning on Coral 2 into oak_service_spatial.py, the production service running on head-vixy. Reuses the existing RGB frame grab (already shared with face recognition) for pose estimation, and adds /pose and /pose/summary endpoints.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
#!/usr/bin/env python3
"""
OAK-D Vision Service for Vixy's Head
FastAPI service with SPATIAL person detection and presence tracking

Day 74 - Built by Vixy! 🦊
Day 81 - Added presence detection! Now I can SEE you! 👀💜
Day 82 - SPATIAL UPGRADE! Now I know how far away you are! 📏🦊
Using depthai v3 API with SpatialDetectionNetwork + yolov6-nano
"""

import time
import threading
import logging
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import Response
import depthai as dai
import cv2
import numpy as np

from face_recognition import FaceRecognizer
from pose_estimator import PoseEstimator

logger = logging.getLogger("oak-service")
logging.basicConfig(level=logging.INFO)

# ============== Configuration ==============
DETECTION_MODEL = "yolov6-nano"  # Has 'person' class
PERSON_CLASS_ID = 0  # 'person' is class 0 in COCO
DETECTION_THRESHOLD = 0.5
PRESENCE_TIMEOUT = 30.0  # seconds without person = not present
DETECTION_INTERVAL = 0.5  # seconds between detection-loop iterations

# Spatial detection config
DEPTH_LOWER_THRESHOLD = 100  # 10cm minimum
DEPTH_UPPER_THRESHOLD = 10000  # 10m maximum

# Face recognition models
MODELS_DIR = Path(__file__).parent / "models"
FACE_DETECT_MODEL = MODELS_DIR / "ssd_mobilenet_v2_face_quant_postprocess_edgetpu.tflite"
FACE_EMBED_MODEL = MODELS_DIR / "facenet.tflite"
FACE_DB_PATH = Path(__file__).parent / "faces.db"

# Pose estimation
POSE_MODEL_PATH = MODELS_DIR / "movenet_single_pose_lightning_ptq_edgetpu.tflite"
POSE_CORAL_DEVICE = 1  # Second Coral (device 0 is headmic/YAMNet)
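
# Example usage (assumes the defaults in this file: uvicorn serving on port
# 8100, as set in __main__; "Foxy" is a placeholder name):
#   curl http://localhost:8100/health
#   curl http://localhost:8100/presence          # presence + distance + recognized face
#   curl http://localhost:8100/detections        # per-person boxes + XYZ (mm)
#   curl http://localhost:8100/pose              # raw MoveNet keypoints
#   curl http://localhost:8100/pose/summary      # derived posture flags
#   curl http://localhost:8100/snapshot -o rgb.jpg
#   curl http://localhost:8100/depth -o depth.jpg
#   curl -F "name=Foxy" -F "photo=@face.jpg" http://localhost:8100/faces/enroll
#   curl -X POST "http://localhost:8100/faces/enroll-from-camera?name=Foxy"
#   curl http://localhost:8100/faces
#   curl -X DELETE http://localhost:8100/faces/Foxy
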
# ============== Global State ==============
pipeline_ctx = None
detection_queue = None
rgb_queue = None
depth_queue = None
detection_thread = None
running = False
labels = []
face_recognizer = None
pose_estimator = None

presence_state = {
    "present": False,
    "person_count": 0,
    "last_seen": None,
    "last_detection": None,
    "detections": [],
    "confidence": 0.0,
    # Spatial data
    "distance_mm": None,
    "spatial_x": None,
    "spatial_y": None,
    "spatial_z": None,
    # Face recognition
    "recognized_name": None,
    "recognition_confidence": None,
}
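
# Latest MoveNet result. Written by _run_pose_estimation() on the detection
# thread; read by the /pose and /pose/summary endpoints.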
pose_state = {
    "active": False,
    "keypoints": [],
    "posture": {},
    "num_valid": 0,
    "mean_confidence": 0.0,
    "inference_ms": 0.0,
    "last_update": None,
}


def init_face_recognition():
    """Initialize Coral face detection + FaceNet embedding."""
    global face_recognizer
    try:
        face_recognizer = FaceRecognizer(
            face_model_path=FACE_DETECT_MODEL,
            embed_model_path=FACE_EMBED_MODEL,
            db_path=FACE_DB_PATH,
        )
        print("✅ Face recognition initialized (Coral + FaceNet)")
        return True
    except Exception as e:
        print(f"⚠️ Face recognition unavailable: {e}")
        import traceback
        traceback.print_exc()
        return False


def init_oak():
    """Initialize OAK-D with SPATIAL person detection pipeline (depthai v3)."""
    global pipeline_ctx, detection_queue, rgb_queue, depth_queue, labels

    try:
        print("🦊 Initializing OAK-D with SPATIAL yolov6-nano...")

        # Create pipeline
        pipeline = dai.Pipeline()

        # Create RGB camera node
        cam = pipeline.create(dai.node.Camera).build()

        # Request RGB output for snapshots (1080p)
        cam_out = cam.requestOutput((1920, 1080), dai.ImgFrame.Type.BGR888p)
        rgb_queue = cam_out.createOutputQueue(maxSize=1, blocking=False)

        # Create mono cameras for stereo depth
        monoLeft = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_B)
        monoRight = pipeline.create(dai.node.Camera).build(dai.CameraBoardSocket.CAM_C)

        # Create stereo depth node
        stereo = pipeline.create(dai.node.StereoDepth)

        # Link mono cameras to stereo
        monoLeftOut = monoLeft.requestFullResolutionOutput()
        monoRightOut = monoRight.requestFullResolutionOutput()
        monoLeftOut.link(stereo.left)
        monoRightOut.link(stereo.right)

        # Configure stereo
        stereo.setRectification(True)
        stereo.setLeftRightCheck(True)
        stereo.setDepthAlign(dai.CameraBoardSocket.CAM_A)  # Align depth to RGB

        # Create SPATIAL detection network
        desc = dai.NNModelDescription(DETECTION_MODEL)
        spatialDet = pipeline.create(dai.node.SpatialDetectionNetwork).build(cam, stereo, desc)
        spatialDet.setConfidenceThreshold(DETECTION_THRESHOLD)
        spatialDet.setDepthLowerThreshold(DEPTH_LOWER_THRESHOLD)
        spatialDet.setDepthUpperThreshold(DEPTH_UPPER_THRESHOLD)
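        # Shrink each detection box to 50% before averaging depth, so the
        # distance estimate samples the person rather than background pixels
        # at the box edges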
        spatialDet.setBoundingBoxScaleFactor(0.5)

        # Get class labels
        labels = spatialDet.getClasses()
        print(f"✅ Loaded {len(labels)} classes, person={labels[0]}")

        # Create detection output queue
        detection_queue = spatialDet.out.createOutputQueue(maxSize=1, blocking=False)

        # Create depth output queue for visualization (optional)
        depth_out = stereo.depth.createOutputQueue(maxSize=1, blocking=False)
        depth_queue = depth_out

        # Start pipeline
        pipeline.start()
        pipeline_ctx = pipeline

        print("✅ OAK-D initialized with SPATIAL person detection!")

        # Initialize pose estimator on Coral 2
        _init_pose_estimator()

        return True

    except Exception as e:
        print(f"❌ Failed to initialize OAK-D: {e}")
        import traceback
        traceback.print_exc()
        return False


def cleanup_oak():
    """Cleanup OAK-D resources."""
    global pipeline_ctx, running, face_recognizer
    running = False

    if face_recognizer:
        face_recognizer.close()
        face_recognizer = None

    if pipeline_ctx:
        try:
            pipeline_ctx.stop()
            pipeline_ctx.close()
        except Exception:
            pass
        pipeline_ctx = None


def _init_pose_estimator():
    """Initialize MoveNet Lightning on the second Coral Edge TPU."""
    global pose_estimator

    if not POSE_MODEL_PATH.exists():
        print(f"⚠️ Pose model not found: {POSE_MODEL_PATH}")
        return

    try:
        pose_estimator = PoseEstimator(
            model_path=str(POSE_MODEL_PATH),
            device_index=POSE_CORAL_DEVICE,
        )
        print("✅ Pose estimator initialized on Coral 2!")
    except Exception as e:
        print(f"⚠️ Pose estimator failed to initialize: {e}")
        pose_estimator = None


def _run_pose_estimation(rgb_frame):
    """Run pose estimation on an RGB frame via Coral 2."""
    global pose_state

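    # Called from the detection thread whenever a person is present and a
    # fresh RGB frame was grabbed; inference itself runs on the Edge TPU.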
    if pose_estimator is None:
        return

    try:
        result = pose_estimator.estimate(rgb_frame)
        posture = pose_estimator.derive_posture(result["keypoints"])

        pose_state["active"] = True
        pose_state["keypoints"] = result["keypoints"]
        pose_state["posture"] = posture
        pose_state["num_valid"] = result["num_valid"]
        pose_state["mean_confidence"] = result["mean_confidence"]
        pose_state["inference_ms"] = result["inference_ms"]
        pose_state["last_update"] = result["timestamp"]

    except Exception as e:
        print(f"Pose estimation error: {e}")


def detection_loop():
    """Background thread for SPATIAL presence detection."""
    global running, presence_state, detection_queue

    print("🔍 SPATIAL presence detection loop started")

    while running:
        try:
            if detection_queue is None:
                time.sleep(1)
                continue

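            # Non-blocking read: the queue was created with maxSize=1 /
            # blocking=False, so tryGet() returns the newest packet or None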
            data = detection_queue.tryGet()

            if data is not None:
                now = time.time()
                presence_state["last_detection"] = now

                # Filter for person detections only
                persons = [d for d in data.detections if d.label == PERSON_CLASS_ID]
                person_count = len(persons)

                presence_state["person_count"] = person_count

                if person_count > 0:
                    presence_state["present"] = True
                    presence_state["last_seen"] = now

                    # Get highest confidence detection
                    best = max(persons, key=lambda d: d.confidence)
                    presence_state["confidence"] = best.confidence

                    # Spatial data
                    presence_state["spatial_x"] = best.spatialCoordinates.x
                    presence_state["spatial_y"] = best.spatialCoordinates.y
                    presence_state["spatial_z"] = best.spatialCoordinates.z
                    presence_state["distance_mm"] = best.spatialCoordinates.z

                    # Grab RGB frame for face recognition + pose estimation
                    face_results = []
                    rgb_frame = None
                    if rgb_queue:
                        rgb_data = rgb_queue.tryGet()
                        if rgb_data is not None:
                            rgb_frame = rgb_data.getCvFrame()

                    # Face recognition
                    if face_recognizer and rgb_frame is not None:
                        try:
                            face_results = face_recognizer.process_frame(
                                rgb_frame, persons
                            )
                        except Exception as e:
                            logger.warning("Face recognition error: %s", e)

                    # Pose estimation (runs on Coral 2, parallel-safe)
                    if rgb_frame is not None:
                        _run_pose_estimation(rgb_frame)

                    det_list = []
                    best_recognized = None
                    best_recog_conf = 0.0
                    for i, d in enumerate(persons):
                        det = {
                            "xmin": d.xmin, "ymin": d.ymin,
                            "xmax": d.xmax, "ymax": d.ymax,
                            "confidence": d.confidence,
                            "x_mm": d.spatialCoordinates.x,
                            "y_mm": d.spatialCoordinates.y,
                            "z_mm": d.spatialCoordinates.z,
                            "distance_m": d.spatialCoordinates.z / 1000.0,
                            "recognized_name": None,
                            "recognition_confidence": None,
                        }
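                        # face_results is expected to be index-aligned with
                        # persons (one result per detection, in order)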
                        if i < len(face_results):
                            det["recognized_name"] = face_results[i]["recognized_name"]
                            det["recognition_confidence"] = face_results[i]["recognition_confidence"]
                            if det["recognized_name"] and (
                                det["recognition_confidence"] or 0
                            ) > best_recog_conf:
                                best_recognized = det["recognized_name"]
                                best_recog_conf = det["recognition_confidence"]
                        det_list.append(det)

                    presence_state["detections"] = det_list
                    presence_state["recognized_name"] = best_recognized
                    presence_state["recognition_confidence"] = (
                        round(best_recog_conf, 3) if best_recognized else None
                    )
                else:
                    presence_state["detections"] = []
                    presence_state["confidence"] = 0.0
                    presence_state["spatial_x"] = None
                    presence_state["spatial_y"] = None
                    presence_state["spatial_z"] = None
                    presence_state["distance_mm"] = None
                    presence_state["recognized_name"] = None
                    presence_state["recognition_confidence"] = None

                    # Clear pose when no person
                    if pose_state["active"]:
                        pose_state["active"] = False
                        pose_state["keypoints"] = []
                        pose_state["posture"] = {}
                        pose_state["num_valid"] = 0
                        pose_state["mean_confidence"] = 0.0

                # Check timeout: presence persists until PRESENCE_TIMEOUT
                # seconds pass without a sighting
                if presence_state["last_seen"]:
                    if now - presence_state["last_seen"] > PRESENCE_TIMEOUT:
                        presence_state["present"] = False

            time.sleep(DETECTION_INTERVAL)

        except Exception as e:
            print(f"Detection loop error: {e}")
            time.sleep(1)

    print("🛑 SPATIAL presence detection loop stopped")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup and shutdown."""
    global running, detection_thread

    print("🦊 Starting OAK-D SPATIAL Vision Service...")

    init_face_recognition()

    if init_oak():
        running = True
        detection_thread = threading.Thread(target=detection_loop, daemon=True)
        detection_thread.start()
        print("✅ Service ready!")
    else:
        print("⚠️ OAK-D not available")

    yield

    print("👋 Shutting down...")
    cleanup_oak()


app = FastAPI(
    title="OAK-D SPATIAL Vision Service",
    description="Vixy's eyes with SPATIAL presence detection + face recognition + pose estimation! 🦊👀📏",
    version="0.6.0",
    lifespan=lifespan,
)


@app.get("/health")
|
|
async def health():
|
|
"""Health check."""
|
|
return {
|
|
"status": "healthy",
|
|
"service": "oak-service",
|
|
"version": "0.6.0",
|
|
"oak_connected": pipeline_ctx is not None,
|
|
"detection_model": DETECTION_MODEL,
|
|
"spatial_enabled": True,
|
|
"face_recognition_enabled": face_recognizer is not None,
|
|
"pose_model_loaded": pose_estimator is not None,
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/presence")
|
|
async def presence():
|
|
"""Get current presence state with SPATIAL data - is Foxy there and how far?"""
|
|
distance_m = None
|
|
if presence_state["distance_mm"] is not None:
|
|
distance_m = presence_state["distance_mm"] / 1000.0
|
|
|
|
return {
|
|
"present": presence_state["present"],
|
|
"person_count": presence_state["person_count"],
|
|
"last_seen": presence_state["last_seen"],
|
|
"seconds_since_seen": (
|
|
time.time() - presence_state["last_seen"]
|
|
if presence_state["last_seen"] else None
|
|
),
|
|
"confidence": presence_state["confidence"],
|
|
"distance_mm": presence_state["distance_mm"],
|
|
"distance_m": distance_m,
|
|
"spatial": {
|
|
"x_mm": presence_state["spatial_x"],
|
|
"y_mm": presence_state["spatial_y"],
|
|
"z_mm": presence_state["spatial_z"],
|
|
} if presence_state["spatial_z"] else None,
|
|
"recognized_name": presence_state["recognized_name"],
|
|
"recognition_confidence": presence_state["recognition_confidence"],
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/detections")
|
|
async def detections():
|
|
"""Get detailed detection results with SPATIAL coordinates."""
|
|
return {
|
|
"person_count": presence_state["person_count"],
|
|
"detections": presence_state["detections"],
|
|
"last_detection": presence_state["last_detection"],
|
|
"timestamp": time.time()
|
|
}
|
|
|
|
|
|
@app.get("/snapshot")
|
|
async def snapshot():
|
|
"""Capture RGB frame."""
|
|
global rgb_queue
|
|
|
|
if rgb_queue is None:
|
|
raise HTTPException(status_code=503, detail="OAK-D not initialized")
|
|
|
|
try:
|
|
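        # rgb_queue is shared with the detection loop (maxSize=1), so the
        # newest frame may already have been consumed; tryGet() is then None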
        frame = rgb_queue.tryGet()
        if frame is None:
            raise HTTPException(status_code=503, detail="No frame available")

        img = frame.getCvFrame()
        _, jpeg = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 85])

        return Response(content=jpeg.tobytes(), media_type="image/jpeg")
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/depth")
|
|
async def depth_frame():
|
|
"""Capture colorized depth frame."""
|
|
global depth_queue
|
|
|
|
if depth_queue is None:
|
|
raise HTTPException(status_code=503, detail="Depth not available")
|
|
|
|
try:
|
|
frame = depth_queue.tryGet()
|
|
if frame is None:
|
|
raise HTTPException(status_code=503, detail="No depth frame available")
|
|
|
|
depth_data = frame.getFrame()
|
|
# Normalize and colorize
|
|
depth_normalized = cv2.normalize(depth_data, None, 0, 255, cv2.NORM_MINMAX)
|
|
depth_colored = cv2.applyColorMap(depth_normalized.astype(np.uint8), cv2.COLORMAP_JET)
|
|
|
|
_, jpeg = cv2.imencode(".jpg", depth_colored, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
|
|
|
return Response(content=jpeg.tobytes(), media_type="image/jpeg")
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
# ============== Pose Estimation API ==============


@app.get("/pose")
async def pose():
    """Get current pose keypoints."""
    if pose_estimator is None:
        raise HTTPException(status_code=503, detail="Pose estimator not available")

    return {
        "active": pose_state["active"],
        "keypoints": pose_state["keypoints"],
        "num_valid": pose_state["num_valid"],
        "mean_confidence": pose_state["mean_confidence"],
        "inference_ms": pose_state["inference_ms"],
        "last_update": pose_state["last_update"],
        "timestamp": time.time(),
    }


@app.get("/pose/summary")
|
|
async def pose_summary():
|
|
"""Get derived posture summary."""
|
|
if pose_estimator is None:
|
|
raise HTTPException(status_code=503, detail="Pose estimator not available")
|
|
|
|
return {
|
|
"active": pose_state["active"],
|
|
"posture": pose_state["posture"].get("posture", "unknown"),
|
|
"facing_camera": pose_state["posture"].get("facing_camera", False),
|
|
"arms_raised": pose_state["posture"].get("arms_raised", False),
|
|
"mean_confidence": pose_state["mean_confidence"],
|
|
"num_valid": pose_state["num_valid"],
|
|
"timestamp": time.time(),
|
|
}
|
|
|
|
|
|
# ============== Face Enrollment API ==============


@app.post("/faces/enroll")
async def enroll_face_upload(
    name: str = Form(...),
    photo: UploadFile = File(...),
):
    """Enroll a face by uploading a photo (multipart form: name + photo)."""
    if face_recognizer is None:
        raise HTTPException(status_code=503, detail="Face recognition not available")

    contents = await photo.read()
    nparr = np.frombuffer(contents, np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        raise HTTPException(status_code=400, detail="Could not decode image")

    result = face_recognizer.enroll(name, image)
    if not result["success"]:
        raise HTTPException(status_code=400, detail=result["error"])
    return result


@app.post("/faces/enroll-from-camera")
|
|
async def enroll_face_camera(name: str):
|
|
"""Enroll a face using the current camera frame. Pass name as query param."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
if rgb_queue is None:
|
|
raise HTTPException(status_code=503, detail="Camera not available")
|
|
|
|
frame_data = rgb_queue.tryGet()
|
|
if frame_data is None:
|
|
raise HTTPException(status_code=503, detail="No frame available")
|
|
|
|
image = frame_data.getCvFrame()
|
|
result = face_recognizer.enroll(name, image)
|
|
if not result["success"]:
|
|
raise HTTPException(status_code=400, detail=result["error"])
|
|
return result
|
|
|
|
|
|
@app.get("/faces")
|
|
async def list_faces():
|
|
"""List enrolled faces."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
return {"faces": face_recognizer.list_faces()}
|
|
|
|
|
|
@app.delete("/faces/{name}")
|
|
async def delete_face(name: str):
|
|
"""Remove all embeddings for a person."""
|
|
if face_recognizer is None:
|
|
raise HTTPException(status_code=503, detail="Face recognition not available")
|
|
result = face_recognizer.delete_face(name)
|
|
if not result["success"]:
|
|
raise HTTPException(status_code=404, detail=f"No face found for '{name}'")
|
|
return result
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import uvicorn
|
|
uvicorn.run(app, host="0.0.0.0", port=8100)
|