diff --git a/face_recognition.py b/face_recognition.py
index bd31a33..a26a4bb 100644
--- a/face_recognition.py
+++ b/face_recognition.py
@@ -14,10 +14,13 @@ import cv2
 import numpy as np
 
 logger = logging.getLogger("face_recognition")
+logger.setLevel(logging.INFO)
 
 FACE_DETECT_THRESHOLD = 0.5
 RECOGNITION_THRESHOLD = 0.5
 EMBEDDING_DIM = 512
+MIN_CROP_SIZE = 80  # minimum pixels in both dimensions for face detection
+FACE_DETECT_INTERVAL = 2.0  # seconds between face detection runs in process_frame
 
 
 class FaceRecognizer:
@@ -34,6 +37,42 @@ class FaceRecognizer:
         self._face_interp.allocate_tensors()
         self._face_input = self._face_interp.get_input_details()[0]
         self._face_outputs = self._face_interp.get_output_details()
+
+        # Log output tensor details to determine correct index mapping
+        for i, o in enumerate(self._face_outputs):
+            logger.info("Face detector output[%d]: name=%s shape=%s", i, o["name"], o["shape"])
+
+        # Determine score tensor index: run a test inference to find which
+        # [1,N] tensor has non-zero values (scores) vs all-zeros (class IDs)
+        inp_shape = self._face_input["shape"]
+        test_input = np.zeros(inp_shape, dtype=self._face_input["dtype"])
+        self._face_interp.set_tensor(self._face_input["index"], test_input)
+        self._face_interp.invoke()
+
+        # Output 0 is boxes [1,N,4], output 3 is count [1]
+        # Outputs 1 and 2 are scores and classes (order varies by model)
+        t1 = self._face_interp.get_tensor(self._face_outputs[1]["index"])
+        t2 = self._face_interp.get_tensor(self._face_outputs[2]["index"])
+        # For a blank image: scores should be low but potentially non-zero,
+        # while class IDs for a single-class model are always 0.0
+        # Use output name as primary signal if available
+        self._score_output_idx = 2  # default
+        for i in (1, 2):
+            name = self._face_outputs[i].get("name", "").lower()
+            if "score" in name:
+                self._score_output_idx = i
+                break
+            if "class" in name:
+                self._score_output_idx = 2 if i == 1 else 1
+                break
+        else:
+            # No name match — use heuristic: pick the one with higher variance
+            if np.std(t1) > np.std(t2):
+                self._score_output_idx = 1
+            else:
+                self._score_output_idx = 2
+
+        logger.info("Face detector: using output[%d] as scores", self._score_output_idx)
         logger.info(
             "Face detector ready: input %s %s",
             self._face_input["shape"],
@@ -74,6 +113,10 @@ class FaceRecognizer:
         self._reload_cache()
         logger.info("Face DB: %d embeddings loaded", len(self._cache))
 
+        # Throttling for process_frame
+        self._last_face_detect_time = 0.0
+        self._last_face_results = []  # cached results from last detection
+
     def _reload_cache(self):
         rows = self._db.execute("SELECT name, embedding FROM faces").fetchall()
         cache = []
@@ -86,18 +129,33 @@ class FaceRecognizer:
     def _detect_face(self, image):
         """Run face detection on Coral.
         Returns best face bbox (y1,x1,y2,x2 in pixels) or None."""
         h, w = image.shape[:2]
+
+        # Skip if crop is too small for reliable face detection
+        if h < MIN_CROP_SIZE or w < MIN_CROP_SIZE:
+            logger.debug("detect_face: skipping %dx%d crop (too small)", w, h)
+            return None, 0.0
+
         inp_h, inp_w = self._face_input["shape"][1:3]
-        resized = cv2.resize(image, (inp_w, inp_h))
-        if resized.dtype != np.uint8:
-            resized = resized.astype(np.uint8)
+
+        # Letterbox resize: preserve aspect ratio, pad with black
+        scale = min(inp_w / w, inp_h / h)
+        new_w = int(w * scale)
+        new_h = int(h * scale)
+        resized = cv2.resize(image, (new_w, new_h))
+
+        # Create padded input
+        padded = np.zeros((inp_h, inp_w, 3), dtype=np.uint8)
+        pad_y = (inp_h - new_h) // 2
+        pad_x = (inp_w - new_w) // 2
+        padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
+
         self._face_interp.set_tensor(
-            self._face_input["index"], resized[np.newaxis]
+            self._face_input["index"], padded[np.newaxis]
         )
         self._face_interp.invoke()
-        # Parse outputs: boxes [1,50,4], classes [1,50], scores [1,50], count [1]
         boxes = self._face_interp.get_tensor(self._face_outputs[0]["index"])[0]
-        scores = self._face_interp.get_tensor(self._face_outputs[2]["index"])[0]
+        scores = self._face_interp.get_tensor(self._face_outputs[self._score_output_idx]["index"])[0]
         count = int(
             self._face_interp.get_tensor(self._face_outputs[3]["index"])[0]
         )
@@ -108,14 +166,17 @@ class FaceRecognizer:
             if scores[i] >= FACE_DETECT_THRESHOLD and scores[i] > best_score:
                 best_score = scores[i]
                 # boxes are [ymin, xmin, ymax, xmax] normalized 0-1
+                # Map back from letterboxed coords to original image coords
                 ymin, xmin, ymax, xmax = boxes[i]
-                best_box = (
-                    max(0, int(ymin * h)),
-                    max(0, int(xmin * w)),
-                    min(h, int(ymax * h)),
-                    min(w, int(xmax * w)),
-                )
+                # Convert from padded coords to original
+                orig_y1 = max(0, int((ymin * inp_h - pad_y) / scale))
+                orig_x1 = max(0, int((xmin * inp_w - pad_x) / scale))
+                orig_y2 = min(h, int((ymax * inp_h - pad_y) / scale))
+                orig_x2 = min(w, int((xmax * inp_w - pad_x) / scale))
+                best_box = (orig_y1, orig_x1, orig_y2, orig_x2)
 
+        if best_box is not None:
+            logger.debug("detect_face: %dx%d -> face at score=%.2f", w, h, best_score)
         return best_box, best_score
 
     def _compute_embedding(self, face_image):
@@ -165,25 +226,37 @@ class FaceRecognizer:
             list of dicts (same order as person_detections):
             {recognized_name: str|None, recognition_confidence: float|None}
         """
+        now = time.monotonic()
+        if now - self._last_face_detect_time < FACE_DETECT_INTERVAL:
+            return self._last_face_results
+
+        self._last_face_detect_time = now
         h, w = rgb_frame.shape[:2]
         results = []
 
         for det in person_detections:
-            # Crop upper 40% of person bbox (head + shoulders)
+            # Crop upper 50% of person bbox as a roughly square region
            px1 = max(0, int(det.xmin * w))
            py1 = max(0, int(det.ymin * h))
            px2 = min(w, int(det.xmax * w))
            py2 = min(h, int(det.ymax * h))
 
+            bbox_w = px2 - px1
             bbox_h = py2 - py1
-            upper_y2 = py1 + int(bbox_h * 0.4)
+            upper_h = int(bbox_h * 0.5)
 
-            # Add 10% horizontal padding
-            pad_x = int((px2 - px1) * 0.1)
-            crop_x1 = max(0, px1 - pad_x)
-            crop_x2 = min(w, px2 + pad_x)
+            # Make crop roughly square: if width >> height, narrow it
+            # Center the crop horizontally on the person bbox
+            crop_h = upper_h
+            crop_w = max(bbox_w, upper_h)  # at least as wide as tall
+            if bbox_w > upper_h * 2:
+                # Very wide bbox — narrow to ~1.5x the height, centered
+                crop_w = int(upper_h * 1.5)
+            cx = (px1 + px2) // 2
+            crop_x1 = max(0, cx - crop_w // 2)
+            crop_x2 = min(w, cx + crop_w // 2)
 
-            crop = rgb_frame[py1:upper_y2, crop_x1:crop_x2]
+            crop = rgb_frame[py1:py1 + crop_h, crop_x1:crop_x2]
             if crop.size == 0:
                 results.append({"recognized_name": None, "recognition_confidence": None})
                 continue
@@ -209,6 +282,7 @@ class FaceRecognizer:
                 "recognition_confidence": round(confidence, 3),
             })
 
+        self._last_face_results = results
         return results
 
     def enroll(self, name, image):
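
Below the patch: a minimal round-trip sketch of the letterbox box-mapping math used in _detect_face, assuming the same scale/pad formulas as the diff. It is not part of the patch; the helper names (letterbox_params, unletterbox_box) and the sample dimensions are hypothetical, chosen only to check that a pixel box survives the forward and inverse mapping.

def letterbox_params(w, h, inp_w, inp_h):
    # Same scale/pad math as _detect_face: fit (w, h) into (inp_w, inp_h), centered
    scale = min(inp_w / w, inp_h / h)
    new_w, new_h = int(w * scale), int(h * scale)
    pad_x = (inp_w - new_w) // 2
    pad_y = (inp_h - new_h) // 2
    return scale, pad_x, pad_y

def unletterbox_box(box, scale, pad_x, pad_y, w, h, inp_w, inp_h):
    # Map a normalized [ymin, xmin, ymax, xmax] detector box back to crop pixels,
    # mirroring the orig_y1/orig_x1/orig_y2/orig_x2 expressions in the patch
    ymin, xmin, ymax, xmax = box
    y1 = max(0, int((ymin * inp_h - pad_y) / scale))
    x1 = max(0, int((xmin * inp_w - pad_x) / scale))
    y2 = min(h, int((ymax * inp_h - pad_y) / scale))
    x2 = min(w, int((xmax * inp_w - pad_x) / scale))
    return y1, x1, y2, x2

# Round-trip check with a hypothetical 640x480 crop and a 320x320 detector input
w, h, inp_w, inp_h = 640, 480, 320, 320
scale, pad_x, pad_y = letterbox_params(w, h, inp_w, inp_h)   # 0.5, 0, 40
pixel_box = (100, 200, 300, 400)                             # y1, x1, y2, x2 in the crop
normalized = (
    (pixel_box[0] * scale + pad_y) / inp_h,
    (pixel_box[1] * scale + pad_x) / inp_w,
    (pixel_box[2] * scale + pad_y) / inp_h,
    (pixel_box[3] * scale + pad_x) / inp_w,
)
print(unletterbox_box(normalized, scale, pad_x, pad_y, w, h, inp_w, inp_h))  # (100, 200, 300, 400)

The sketch only exercises the coordinate arithmetic; it deliberately leaves out cv2.resize and the interpreter so the check stays independent of the Coral runtime.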