Fix face detection: letterbox resize, smart cropping, throttle
- Use letterbox resize (preserve aspect ratio + pad) instead of stretching to 320x320. Stretching 16:9 frames caused faces to be undetectable.
- Auto-detect score tensor output index at init time (name + variance heuristic)
- Smart upper-body crop: roughly square region instead of thin wide strip
- Throttle face detection to every 2s to reduce Coral USB traffic
- Skip crops smaller than 80px (too small for reliable detection)
- Reduce log level from DEBUG to INFO

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -14,10 +14,13 @@ import cv2
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
logger = logging.getLogger("face_recognition")
|
logger = logging.getLogger("face_recognition")
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
|
||||||
FACE_DETECT_THRESHOLD = 0.5
|
FACE_DETECT_THRESHOLD = 0.5
|
||||||
RECOGNITION_THRESHOLD = 0.5
|
RECOGNITION_THRESHOLD = 0.5
|
||||||
EMBEDDING_DIM = 512
|
EMBEDDING_DIM = 512
|
||||||
|
MIN_CROP_SIZE = 80 # minimum pixels in both dimensions for face detection
|
||||||
|
FACE_DETECT_INTERVAL = 2.0 # seconds between face detection runs in process_frame
|
||||||
|
|
||||||
|
|
||||||
class FaceRecognizer:
|
class FaceRecognizer:
|
||||||
@@ -34,6 +37,42 @@ class FaceRecognizer:
|
|||||||
self._face_interp.allocate_tensors()
|
self._face_interp.allocate_tensors()
|
||||||
self._face_input = self._face_interp.get_input_details()[0]
|
self._face_input = self._face_interp.get_input_details()[0]
|
||||||
self._face_outputs = self._face_interp.get_output_details()
|
self._face_outputs = self._face_interp.get_output_details()
|
||||||
|
|
||||||
|
# Log output tensor details to determine correct index mapping
|
||||||
|
for i, o in enumerate(self._face_outputs):
|
||||||
|
logger.info("Face detector output[%d]: name=%s shape=%s", i, o["name"], o["shape"])
|
||||||
|
|
||||||
|
# Determine score tensor index: run a test inference on a blank input so that,
|
||||||
|
# if the output names do not identify it, the two [1,N] tensors can be compared (scores vary, class IDs stay constant)
|
||||||
|
inp_shape = self._face_input["shape"]
|
||||||
|
test_input = np.zeros(inp_shape, dtype=self._face_input["dtype"])
|
||||||
|
self._face_interp.set_tensor(self._face_input["index"], test_input)
|
||||||
|
self._face_interp.invoke()
|
||||||
|
|
||||||
|
# Output 0 is boxes [1,N,4], output 3 is count [1]
|
||||||
|
# Outputs 1 and 2 are scores and classes (order varies by model)
|
||||||
|
t1 = self._face_interp.get_tensor(self._face_outputs[1]["index"])
|
||||||
|
t2 = self._face_interp.get_tensor(self._face_outputs[2]["index"])
|
||||||
|
# For a blank image: scores should be low but potentially non-zero,
|
||||||
|
# while class IDs for a single-class model are always 0.0
|
||||||
|
# Use output name as primary signal if available
|
||||||
|
self._score_output_idx = 2 # default
|
||||||
|
for i in (1, 2):
|
||||||
|
name = self._face_outputs[i].get("name", "").lower()
|
||||||
|
if "score" in name:
|
||||||
|
self._score_output_idx = i
|
||||||
|
break
|
||||||
|
if "class" in name:
|
||||||
|
self._score_output_idx = 2 if i == 1 else 1
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
# No name match — use heuristic: pick the one with higher variance
|
||||||
|
if np.std(t1) > np.std(t2):
|
||||||
|
self._score_output_idx = 1
|
||||||
|
else:
|
||||||
|
self._score_output_idx = 2
|
||||||
|
|
||||||
|
logger.info("Face detector: using output[%d] as scores", self._score_output_idx)
|
||||||
logger.info(
|
logger.info(
|
||||||
"Face detector ready: input %s %s",
|
"Face detector ready: input %s %s",
|
||||||
self._face_input["shape"],
|
self._face_input["shape"],
|
||||||
@@ -74,6 +113,10 @@ class FaceRecognizer:
|
|||||||
self._reload_cache()
|
self._reload_cache()
|
||||||
logger.info("Face DB: %d embeddings loaded", len(self._cache))
|
logger.info("Face DB: %d embeddings loaded", len(self._cache))
|
||||||
|
|
||||||
|
# Throttling for process_frame
|
||||||
|
self._last_face_detect_time = 0.0
|
||||||
|
self._last_face_results = [] # cached results from last detection
|
||||||
|
|
||||||
def _reload_cache(self):
|
def _reload_cache(self):
|
||||||
rows = self._db.execute("SELECT name, embedding FROM faces").fetchall()
|
rows = self._db.execute("SELECT name, embedding FROM faces").fetchall()
|
||||||
cache = []
|
cache = []
|
||||||
@@ -86,18 +129,33 @@ class FaceRecognizer:
|
|||||||
def _detect_face(self, image):
|
def _detect_face(self, image):
|
||||||
"""Run face detection on Coral. Returns best face bbox (y1,x1,y2,x2 in pixels) or None."""
|
"""Run face detection on Coral. Returns best face bbox (y1,x1,y2,x2 in pixels) or None."""
|
||||||
h, w = image.shape[:2]
|
h, w = image.shape[:2]
|
||||||
|
|
||||||
|
# Skip if crop is too small for reliable face detection
|
||||||
|
if h < MIN_CROP_SIZE or w < MIN_CROP_SIZE:
|
||||||
|
logger.debug("detect_face: skipping %dx%d crop (too small)", w, h)
|
||||||
|
return None, 0.0
|
||||||
|
|
||||||
inp_h, inp_w = self._face_input["shape"][1:3]
|
inp_h, inp_w = self._face_input["shape"][1:3]
|
||||||
resized = cv2.resize(image, (inp_w, inp_h))
|
|
||||||
if resized.dtype != np.uint8:
|
# Letterbox resize: preserve aspect ratio, pad with black
|
||||||
resized = resized.astype(np.uint8)
|
scale = min(inp_w / w, inp_h / h)
|
||||||
|
new_w = int(w * scale)
|
||||||
|
new_h = int(h * scale)
|
||||||
|
resized = cv2.resize(image, (new_w, new_h))
|
||||||
|
|
||||||
|
# Create padded input
|
||||||
|
padded = np.zeros((inp_h, inp_w, 3), dtype=np.uint8)
|
||||||
|
pad_y = (inp_h - new_h) // 2
|
||||||
|
pad_x = (inp_w - new_w) // 2
|
||||||
|
padded[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = resized
|
||||||
|
|
||||||
self._face_interp.set_tensor(
|
self._face_interp.set_tensor(
|
||||||
self._face_input["index"], resized[np.newaxis]
|
self._face_input["index"], padded[np.newaxis]
|
||||||
)
|
)
|
||||||
self._face_interp.invoke()
|
self._face_interp.invoke()
|
||||||
|
|
||||||
# Parse outputs: boxes [1,50,4], classes [1,50], scores [1,50], count [1]
|
|
||||||
boxes = self._face_interp.get_tensor(self._face_outputs[0]["index"])[0]
|
boxes = self._face_interp.get_tensor(self._face_outputs[0]["index"])[0]
|
||||||
scores = self._face_interp.get_tensor(self._face_outputs[2]["index"])[0]
|
scores = self._face_interp.get_tensor(self._face_outputs[self._score_output_idx]["index"])[0]
|
||||||
count = int(
|
count = int(
|
||||||
self._face_interp.get_tensor(self._face_outputs[3]["index"])[0]
|
self._face_interp.get_tensor(self._face_outputs[3]["index"])[0]
|
||||||
)
|
)
|
||||||
@@ -108,14 +166,17 @@ class FaceRecognizer:
|
|||||||
if scores[i] >= FACE_DETECT_THRESHOLD and scores[i] > best_score:
|
if scores[i] >= FACE_DETECT_THRESHOLD and scores[i] > best_score:
|
||||||
best_score = scores[i]
|
best_score = scores[i]
|
||||||
# boxes are [ymin, xmin, ymax, xmax] normalized 0-1
|
# boxes are [ymin, xmin, ymax, xmax] normalized 0-1
|
||||||
|
# Map back from letterboxed coords to original image coords
|
||||||
ymin, xmin, ymax, xmax = boxes[i]
|
ymin, xmin, ymax, xmax = boxes[i]
|
||||||
best_box = (
|
# Convert from padded coords to original
|
||||||
max(0, int(ymin * h)),
|
orig_y1 = max(0, int((ymin * inp_h - pad_y) / scale))
|
||||||
max(0, int(xmin * w)),
|
orig_x1 = max(0, int((xmin * inp_w - pad_x) / scale))
|
||||||
min(h, int(ymax * h)),
|
orig_y2 = min(h, int((ymax * inp_h - pad_y) / scale))
|
||||||
min(w, int(xmax * w)),
|
orig_x2 = min(w, int((xmax * inp_w - pad_x) / scale))
|
||||||
)
|
best_box = (orig_y1, orig_x1, orig_y2, orig_x2)
|
||||||
|
|
||||||
|
if best_box is not None:
|
||||||
|
logger.debug("detect_face: %dx%d -> face at score=%.2f", w, h, best_score)
|
||||||
return best_box, best_score
|
return best_box, best_score
|
||||||
|
|
||||||
def _compute_embedding(self, face_image):
|
def _compute_embedding(self, face_image):
|
||||||
@@ -165,25 +226,37 @@ class FaceRecognizer:
|
|||||||
list of dicts (same order as person_detections):
|
list of dicts (same order as person_detections):
|
||||||
{recognized_name: str|None, recognition_confidence: float|None}
|
{recognized_name: str|None, recognition_confidence: float|None}
|
||||||
"""
|
"""
|
||||||
|
now = time.monotonic()
|
||||||
|
if now - self._last_face_detect_time < FACE_DETECT_INTERVAL:
|
||||||
|
return self._last_face_results
|
||||||
|
|
||||||
|
self._last_face_detect_time = now
|
||||||
h, w = rgb_frame.shape[:2]
|
h, w = rgb_frame.shape[:2]
|
||||||
results = []
|
results = []
|
||||||
|
|
||||||
for det in person_detections:
|
for det in person_detections:
|
||||||
# Crop upper 40% of person bbox (head + shoulders)
|
# Crop upper 50% of person bbox as a roughly square region
|
||||||
px1 = max(0, int(det.xmin * w))
|
px1 = max(0, int(det.xmin * w))
|
||||||
py1 = max(0, int(det.ymin * h))
|
py1 = max(0, int(det.ymin * h))
|
||||||
px2 = min(w, int(det.xmax * w))
|
px2 = min(w, int(det.xmax * w))
|
||||||
py2 = min(h, int(det.ymax * h))
|
py2 = min(h, int(det.ymax * h))
|
||||||
|
|
||||||
|
bbox_w = px2 - px1
|
||||||
bbox_h = py2 - py1
|
bbox_h = py2 - py1
|
||||||
upper_y2 = py1 + int(bbox_h * 0.4)
|
upper_h = int(bbox_h * 0.5)
|
||||||
|
|
||||||
# Add 10% horizontal padding
|
# Make crop roughly square: if width >> height, narrow it
|
||||||
pad_x = int((px2 - px1) * 0.1)
|
# Center the crop horizontally on the person bbox
|
||||||
crop_x1 = max(0, px1 - pad_x)
|
crop_h = upper_h
|
||||||
crop_x2 = min(w, px2 + pad_x)
|
crop_w = max(bbox_w, upper_h) # at least as wide as tall
|
||||||
|
if bbox_w > upper_h * 2:
|
||||||
|
# Very wide bbox — narrow to ~1.5x the height, centered
|
||||||
|
crop_w = int(upper_h * 1.5)
|
||||||
|
cx = (px1 + px2) // 2
|
||||||
|
crop_x1 = max(0, cx - crop_w // 2)
|
||||||
|
crop_x2 = min(w, cx + crop_w // 2)
|
||||||
|
|
||||||
crop = rgb_frame[py1:upper_y2, crop_x1:crop_x2]
|
crop = rgb_frame[py1:py1 + crop_h, crop_x1:crop_x2]
|
||||||
if crop.size == 0:
|
if crop.size == 0:
|
||||||
results.append({"recognized_name": None, "recognition_confidence": None})
|
results.append({"recognized_name": None, "recognition_confidence": None})
|
||||||
continue
|
continue
|
||||||
@@ -209,6 +282,7 @@ class FaceRecognizer:
|
|||||||
"recognition_confidence": round(confidence, 3),
|
"recognition_confidence": round(confidence, 3),
|
||||||
})
|
})
|
||||||
|
|
||||||
|
self._last_face_results = results
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def enroll(self, name, image):
|
def enroll(self, name, image):
|
||||||
|
|||||||
Reference in New Issue
Block a user