Files
yolov26_3d/ultralytics/utils/plotting_3d.py
2026-06-24 09:35:46 +08:00

2777 lines
106 KiB
Python
Executable File

# Ultralytics AGPL-3.0 License - https://ultralytics.com/license
"""3D detection visualization utilities.
Provides functions for decoding 3D predictions, projecting 3D boxes to 2D,
and drawing 3D wireframe boxes on images. Ported from yolov5-3d/utils/plots.py.
"""
import cv2
import numpy as np
def _default_face_visibility_score_thresh():
"""Return the configured visible-face threshold, even when this module is imported standalone."""
try:
from ultralytics.utils import DEFAULT_CFG
return float(getattr(DEFAULT_CFG, "face_visibility_score_thresh", 0.05))
except Exception:
return 0.05
# Four yaw offsets in radians: 0, +pi/2, -pi/2 and pi.
YAW_BIN_OFFSETS = (0.0, np.pi / 2, -np.pi / 2, np.pi)
# Start index of each face's 8-value block inside the 42-value GT target vector (indexed by face type).
FACE_OFFSETS_42 = (10, 18, 26, 34)
# Start index of each face's block inside the 41-value prediction vector (indexed by face type).
FACE_OFFSETS_41 = (0, 6, 12, 18)
# Start index of each face's 15-value (5 points x 3 values) block inside the 60-value edge prediction.
FACE_EDGE_OFFSETS_60 = (0, 15, 30, 45)
# Box-corner indices that make up each face; face types are 0=front, 1=rear, 2=left, 3=right.
FACE_CORNERS = {0: (4, 5, 6, 7), 1: (0, 1, 2, 3), 2: (1, 2, 5, 6), 3: (0, 3, 4, 7)}
# Pair of box-corner indices used as the bottom edge of each face.
FACE_BOTTOM_EDGE_CORNERS = {0: (6, 7), 1: (2, 3), 2: (2, 6), 3: (3, 7)}
# Default minimum score for a face to be treated as visible (read from the config, fallback 0.05).
FACE_VISIBILITY_SCORE_THRESH = _default_face_visibility_score_thresh()
# Edge-yaw keeps the face-based visible-face threshold for the primary face, but uses a stricter gate for the
# optional second face in the two-face bucket.
EDGE_YAW_VALID_VISIBILITY_SCORE_THRESH = 0.1
EDGE_YAW_CUT_SIDE_MIN_VISIBLE_LENGTH_RATIO = 0.5
EDGE_YAW_MAX_LATERAL_DIST_M = 30.0
# Cut-object states: not cut, "cut in" (front face kept) and "cut out" (rear face kept).
CUT_STATE_NORMAL = 0
CUT_STATE_IN = 1
CUT_STATE_OUT = 2
# One drawing colour per face type (presumably BGR, as used with OpenCV -- TODO confirm).
FACE_COLORS = ((0, 0, 255), (255, 0, 0), (0, 255, 0), (0, 255, 255))
def rotation_3d_in_axis(points, angles, axis=1):
    """Rotate 3D points about one coordinate axis.

    Args:
        points: (N, 3) array of 3D points.
        angles: Rotation angle in radians (scalar).
        axis: Axis to rotate about: 0=X, 1=Y, 2=Z.

    Returns:
        Rotated points (N, 3), computed as ``points @ rot_mat``.

    Raises:
        ValueError: If ``axis`` is not one of 0, 1 or 2.
    """
    s = np.sin(angles)
    c = np.cos(angles)
    one = np.ones_like(c)
    zero = np.zeros_like(c)

    if axis == 0:
        rows = ((one, zero, zero), (zero, c, s), (zero, -s, c))
    elif axis == 1:
        # Y axis (X=right, Y=down, Z=forward)
        rows = ((c, zero, -s), (zero, one, zero), (s, zero, c))
    elif axis == 2:
        rows = ((c, s, zero), (-s, c, zero), (zero, zero, one))
    else:
        raise ValueError(f"axis should be in [0, 1, 2], got {axis}")

    rot_mat = np.stack([np.stack(row) for row in rows])
    return np.dot(points, rot_mat)
def compute_3d_box_corners(center_3d, dimensions, rotation, face_type=-1):
    """Build the 8 corners of a yaw-rotated 3D box in camera coordinates.

    When ``face_type`` is 0..3 the given ``center_3d`` is interpreted as the centre of
    that face instead of the box centre.

    Args:
        center_3d: (x, y, z) anchor position in camera coordinates.
        dimensions: (length, height, width) of the box.
        rotation: rot_y, rotation around the Y axis in radians.
        face_type: -1=box center, 0=front, 1=rear, 2=left, 3=right.

    Returns:
        corners: (8, 3) array of corner coordinates.
    """
    length, height, width = dimensions

    # Enumerate the unit-cube corners, then reorder them into the box corner convention.
    unit = np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1).astype(np.float64)
    unit = unit[[0, 1, 3, 2, 4, 5, 7, 6]]

    # Shift the cube so that the requested anchor (face centre or box centre) sits at the origin.
    anchor_by_face = {0: [1, 0.5, 0.5], 1: [0, 0.5, 0.5], 2: [0.5, 0.5, 1], 3: [0.5, 0.5, 0]}
    unit -= anchor_by_face.get(face_type, [0.5, 0.5, 0.5])

    # Scale to metric size, apply yaw, then translate to the anchor position.
    scaled = np.array([length, height, width]).reshape(1, 3) * unit.reshape(8, 3)
    placed = rotation_3d_in_axis(scaled, rotation, axis=1)
    placed += np.array(center_3d).reshape(1, 3)
    return placed
def apply_fisheye_distortion(x, y, distort_coeffs):
    """Apply Kannala-Brandt fisheye distortion to one normalized camera coordinate.

    Args:
        x: Normalized x coordinate (scalar).
        y: Normalized y coordinate (scalar).
        distort_coeffs: Sequence whose first four entries are k1..k4. ``None`` or fewer
            than four entries disables distortion.

    Returns:
        (xd, yd) distorted normalized coordinates. The input is returned unchanged when
        distortion is disabled or the point lies (numerically) on the optical axis.
    """
    if distort_coeffs is None or len(distort_coeffs) < 4:
        return x, y

    k1, k2, k3, k4 = distort_coeffs[:4]
    radius = np.sqrt(x * x + y * y)
    if radius < 1e-8:
        # Avoid dividing by a vanishing radius; distortion is the identity at the centre.
        return x, y

    theta = np.arctan(radius)
    t2 = theta * theta
    t4 = t2 * t2
    t6 = t4 * t2
    t8 = t4 * t4
    polynomial = 1 + k1 * t2 + k2 * t4 + k3 * t6 + k4 * t8
    theta_d = theta * polynomial

    ratio = theta_d / radius
    return x * ratio, y * ratio
def remove_fisheye_distortion(xd, yd, distort_coeffs, max_iter=20):
    """Invert Kannala-Brandt fisheye distortion for one normalized camera coordinate.

    The undistorted angle is found with Newton iterations on
    ``theta * (1 + k1*theta^2 + k2*theta^4 + k3*theta^6 + k4*theta^8) = theta_d``.

    Args:
        xd: Distorted normalized x coordinate (scalar).
        yd: Distorted normalized y coordinate (scalar).
        distort_coeffs: Sequence whose first four entries are k1..k4. ``None`` or fewer
            than four entries disables undistortion.
        max_iter: Maximum number of Newton iterations.

    Returns:
        (x, y) undistorted normalized coordinates. The input is returned unchanged when
        undistortion is disabled or the point lies (numerically) on the optical axis.
    """
    if distort_coeffs is None or len(distort_coeffs) < 4:
        return xd, yd

    k1, k2, k3, k4 = distort_coeffs[:4]
    radius_d = np.sqrt(xd * xd + yd * yd)
    if radius_d < 1e-8:
        return xd, yd

    # In this model the distorted radius equals the distorted angle.
    theta_d = radius_d
    # First-order initial guess using only k1.
    theta = theta_d / (1 + k1 * (theta_d * theta_d))

    for _ in range(max_iter):
        t2 = theta * theta
        t4 = t2 * t2
        t6 = t4 * t2
        t8 = t4 * t4
        residual = theta * (1 + k1 * t2 + k2 * t4 + k3 * t6 + k4 * t8) - theta_d
        slope = 1 + 3 * k1 * t2 + 5 * k2 * t4 + 7 * k3 * t6 + 9 * k4 * t8
        previous, theta = theta, theta - residual / slope
        if abs(theta - previous) < 1e-8:
            break

    radius = np.tan(theta)
    ratio = radius / radius_d
    return xd * ratio, yd * ratio
def project_3d_to_2d_with_distortion(points_3d, calib):
    """Project camera-space points to pixels through the fisheye distortion model.

    Args:
        points_3d: Iterable of (x, y, z) points in camera coordinates.
        calib: Mapping with ``fx``, ``fy``, ``cx``, ``cy`` and optionally ``distort_coeffs``.

    Returns:
        (N, 2) float array of pixel coordinates. Rows for points with ``z <= 0.1``
        (or non-comparable depth such as NaN) are left as NaN.
    """
    fx, fy = calib["fx"], calib["fy"]
    cx, cy = calib["cx"], calib["cy"]
    coeffs = calib.get("distort_coeffs", [])

    pixels = np.full((len(points_3d), 2), np.nan)
    for row, (x, y, z) in enumerate(points_3d):
        if not z > 0.1:
            # Too close to, or behind, the camera plane: keep the NaN placeholder.
            continue
        x_dist, y_dist = apply_fisheye_distortion(x / z, y / z, coeffs)
        pixels[row] = [fx * x_dist + cx, fy * y_dist + cy]
    return pixels
def project_3d_to_2d_with_calib(points_3d, calib):
    """Project camera-space points to pixels with a plain pinhole model.

    Args:
        points_3d: Iterable of (x, y, z) points in camera coordinates.
        calib: Mapping with ``fx``, ``fy``, ``cx`` and ``cy``.

    Returns:
        (N, 2) float array of pixel coordinates. Rows for points with ``z <= 0.1``
        (or non-comparable depth such as NaN) are left as NaN.
    """
    fx, fy = calib["fx"], calib["fy"]
    cx, cy = calib["cx"], calib["cy"]

    pixels = np.full((len(points_3d), 2), np.nan)
    for row, (x, y, z) in enumerate(points_3d):
        if not z > 0.1:
            # Too close to, or behind, the camera plane: keep the NaN placeholder.
            continue
        pixels[row] = [fx * x / z + cx, fy * y / z + cy]
    return pixels
def project_3d_to_2d(points_3d, calib):
    """Project 3D points to pixels, choosing the projection model from ``calib``.

    Args:
        points_3d: Sequence of (x, y, z) points in camera coordinates.
        calib: Calibration mapping, or ``None``.

    Returns:
        (N, 2) array of pixel coordinates. All-NaN when ``calib`` is ``None``. The fisheye
        projection is used when ``calib["distort_coeffs"]`` holds at least four values,
        otherwise the pinhole projection.
    """
    if calib is None:
        return np.full((len(points_3d), 2), np.nan)

    coeffs = calib.get("distort_coeffs")
    has_distortion = coeffs is not None and len(coeffs) >= 4
    if has_distortion:
        return project_3d_to_2d_with_distortion(points_3d, calib)
    return project_3d_to_2d_with_calib(points_3d, calib)
def sample_3d_edge(p1, p2, num_samples=10):
    """Sample 3D points uniformly along a box edge.

    The endpoints are coerced to arrays first, so plain lists or tuples are accepted as
    well as NumPy arrays (``list - list`` would otherwise raise ``TypeError``).

    Args:
        p1: Start point of the edge, array-like of shape (3,).
        p2: End point of the edge, array-like of shape (3,).
        num_samples: Number of samples, including both endpoints.

    Returns:
        (num_samples, 3) array of points from ``p1`` to ``p2``.
    """
    start = np.asarray(p1)
    end = np.asarray(p2)
    # Column vector of interpolation weights so it broadcasts against the (3,) endpoints.
    t = np.linspace(0, 1, num_samples).reshape(-1, 1)
    return start + t * (end - start)
def _point_inside_image(point_2d, img_w, img_h):
"""Return whether a projected point lies inside the image bounds."""
x, y = float(point_2d[0]), float(point_2d[1])
return np.isfinite(x) and np.isfinite(y) and 0.0 <= x <= img_w - 1 and 0.0 <= y <= img_h - 1
def _solve_edge_image_boundary_t(p0_2d, p1_2d, img_w, img_h):
"""Return the parametric interval whose projected segment lies inside the image."""
p0 = np.asarray(p0_2d, dtype=np.float64)
p1 = np.asarray(p1_2d, dtype=np.float64)
if not np.isfinite(p0).all() or not np.isfinite(p1).all():
return None
dx, dy = p1 - p0
t_min, t_max = 0.0, 1.0
for p, q in ((-dx, p0[0]), (dx, (img_w - 1) - p0[0]), (-dy, p0[1]), (dy, (img_h - 1) - p0[1])):
if abs(p) < 1e-12:
if q < 0:
return None
continue
t = q / p
if p < 0:
t_min = max(t_min, t)
else:
t_max = min(t_max, t)
if t_min > t_max:
return None
return t_min, t_max
def _project_edge_point_at_t(p1, p2, t, calib):
    """Evaluate and project the point at parameter ``t`` on the 3D segment ``p1 -> p2``.

    Args:
        p1: Segment start (x, y, z).
        p2: Segment end (x, y, z).
        t: Interpolation parameter (0 gives ``p1``, 1 gives ``p2``).
        calib: Calibration mapping passed on to ``project_3d_to_2d``.

    Returns:
        ``(point_3d, point_2d)``: the interpolated 3D point and its projected pixel.
    """
    start = np.asarray(p1, dtype=np.float64)
    direction = np.asarray(p2, dtype=np.float64) - np.asarray(p1, dtype=np.float64)
    point_3d = start + float(t) * direction
    # The projector expects a batch, so add and then strip a leading axis.
    projected = project_3d_to_2d(point_3d[None, :], calib)
    return point_3d, projected[0]
def _refine_visible_edge_boundary(p1, p2, calib, img_w, img_h, t_out, t_in, steps=12):
    """Refine one visible/hidden transition on a projected 3D edge by bisection.

    The search keeps one parameter whose projection is outside the image and one whose
    projection is inside, and repeatedly halves the bracket between them. Tracking the
    bounds by role (outside/inside) rather than by numeric order keeps the bracket valid
    for both transition directions; sorting them and always moving the upper bound on an
    "inside" midpoint only works when ``t_out < t_in`` and otherwise collapses the search
    onto the wrong end of the bracket.

    Args:
        p1: Edge start (x, y, z).
        p2: Edge end (x, y, z).
        calib: Calibration mapping used for projection.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        t_out: Edge parameter whose projection is outside the image.
        t_in: Edge parameter whose projection is inside the image.
        steps: Number of bisection iterations.

    Returns:
        float: The refined parameter on the inside of the transition, i.e. the visible
        point closest to the image boundary that the search found.
    """
    t_outside, t_inside = float(t_out), float(t_in)
    for _ in range(steps):
        mid = 0.5 * (t_outside + t_inside)
        _, point_2d = _project_edge_point_at_t(p1, p2, mid, calib)
        if _point_inside_image(point_2d, img_w, img_h):
            # Midpoint is visible: pull the inside bound towards the boundary.
            t_inside = mid
        else:
            # Midpoint is hidden: pull the outside bound towards the boundary.
            t_outside = mid
    return t_inside
def sample_partial_3d_edge(p1, p2, calib, img_w, img_h, num_samples=5, dense_samples=129):
    """Sample exactly ``num_samples`` points from the visible sub-segment of a projected 3D edge.

    The edge is first probed on a dense uniform grid, the longest run of consecutive
    in-image grid points is selected, its two ends are refined towards the image border,
    and the resulting interval is resampled uniformly.

    Args:
        p1: Edge start (x, y, z).
        p2: Edge end (x, y, z).
        calib: Calibration mapping used for projection.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        num_samples: Number of points to return.
        dense_samples: Number of grid points used to find the visible run.

    Returns:
        ``(points_3d, points_2d)`` sorted by increasing pixel u, or ``(None, None)`` when
        no usable visible interval exists or a resampled point falls outside the image.
    """
    ends = np.asarray([p1, p2], dtype=np.float64)
    origin = ends[0:1]
    span = ends[1:2] - ends[0:1]

    grid_t = np.linspace(0.0, 1.0, dense_samples, dtype=np.float64)
    grid_2d = project_3d_to_2d(origin + grid_t[:, None] * span, calib)
    inside = np.array([_point_inside_image(uv, img_w, img_h) for uv in grid_2d], dtype=bool)
    if not inside.any():
        return None, None

    # Split the visible grid indices into consecutive runs and keep the longest one.
    inside_idx = np.flatnonzero(inside)
    breaks = np.where(np.diff(inside_idx) > 1)[0] + 1
    longest_run = max(np.split(inside_idx, breaks), key=len)
    run_first = int(longest_run[0])
    run_last = int(longest_run[-1])

    t_start = grid_t[run_first]
    if run_first > 0:
        # There is a hidden grid point just before the run: refine the entry transition.
        t_start = _refine_visible_edge_boundary(
            ends[0], ends[1], calib, img_w, img_h, grid_t[run_first - 1], grid_t[run_first]
        )

    t_end = grid_t[run_last]
    if run_last < len(grid_t) - 1:
        # There is a hidden grid point just after the run: refine the exit transition.
        t_end = _refine_visible_edge_boundary(
            ends[0], ends[1], calib, img_w, img_h, grid_t[run_last + 1], grid_t[run_last]
        )

    if t_end - t_start < 1e-6:
        return None, None

    picked_t = np.linspace(t_start, t_end, num_samples, dtype=np.float64)
    picked_3d = origin + picked_t[:, None] * span
    picked_2d = project_3d_to_2d(picked_3d, calib)
    if np.any(np.isnan(picked_2d)):
        return None, None
    if not np.all([_point_inside_image(uv, img_w, img_h) for uv in picked_2d]):
        return None, None

    left_to_right = np.argsort(picked_2d[:, 0], kind="stable")
    return picked_3d[left_to_right], picked_2d[left_to_right]
def project_3d_box_edges_with_distortion(corners_3d, calib, samples_per_edge=10):
    """Project densely sampled box edges so they can be drawn as curves under distortion.

    Args:
        corners_3d: (8, 3) box corners in camera coordinates.
        calib: Calibration mapping used by the fisheye projection.
        samples_per_edge: Number of samples taken along every edge.

    Returns:
        dict mapping edge name to a ``(samples_per_edge, 2)`` array of pixel coordinates.
        Covers the 4 back edges, 4 connecting edges, 4 front edges and the 2 front diagonals.
    """
    corner_pairs = {
        "back_0": (4, 5), "back_1": (5, 6), "back_2": (6, 7), "back_3": (7, 4),
        "connect_0": (0, 4), "connect_1": (1, 5), "connect_2": (2, 6), "connect_3": (3, 7),
        "front_0": (0, 1), "front_1": (1, 2), "front_2": (2, 3), "front_3": (3, 0),
        "front_x1": (0, 2), "front_x2": (1, 3),
    }
    return {
        name: project_3d_to_2d_with_distortion(
            sample_3d_edge(corners_3d[a], corners_3d[b], samples_per_edge), calib
        )
        for name, (a, b) in corner_pairs.items()
    }
def plot_box3d_on_img_with_distortion(
    img, edge_points_2d, color_front=(0, 0, 255), color_back=(255, 0, 0), color_side=(255, 255, 0), thickness=1
):
    """Draw a 3D box from per-edge polylines of projected samples.

    Args:
        img: Image to draw on (modified in place).
        edge_points_2d: dict mapping edge name to an (N, 2) array of pixel samples.
        color_front: Colour for edges named ``front_*``.
        color_back: Colour for edges named ``back_*``.
        color_side: Colour for every other edge.
        thickness: Line thickness in pixels.

    Returns:
        The same ``img`` object. Edges containing any NaN sample are skipped.
    """
    front_names = {"front_0", "front_1", "front_2", "front_3", "front_x1", "front_x2"}
    back_names = {"back_0", "back_1", "back_2", "back_3", "back_x1", "back_x2"}

    for name, samples in edge_points_2d.items():
        if np.any(np.isnan(samples)):
            continue
        if name in front_names:
            colour = color_front
        elif name in back_names:
            colour = color_back
        else:
            colour = color_side
        polyline = samples.astype(np.int32)
        cv2.polylines(img, [polyline], isClosed=False, color=colour, thickness=thickness, lineType=cv2.LINE_AA)
    return img
def plot_box3d_on_img(img, corners_2d, color_front=(0, 0, 255), color_back=(255, 0, 0), color_side=(255, 255, 0), thickness=1):
    """Draw a 3D wireframe box from projected 2D corners.

    Edges with a non-finite endpoint are skipped, matching the NaN handling of
    ``plot_box3d_on_img_with_distortion``. Casting NaN/inf to an integer pixel gives a
    meaningless coordinate, so such corners (for example points behind the camera) must
    not be drawn.

    Args:
        img: Image to draw on (modified in place).
        corners_2d: (8, 2) array-like of projected corner pixels.
        color_front: Colour for the front face edges and its two diagonals.
        color_back: Colour for the back face edges.
        color_side: Colour for the four edges connecting front and back.
        thickness: Line thickness in pixels.

    Returns:
        The same ``img`` object.
    """
    line_indices = (
        (4, 5), (5, 6), (6, 7), (7, 4),
        (0, 4), (1, 5), (2, 6), (3, 7),
        (0, 1), (1, 2), (2, 3), (3, 0), (0, 2), (1, 3),
    )
    front_edges = {(0, 1), (1, 2), (2, 3), (3, 0), (0, 2), (1, 3)}
    back_edges = {(4, 5), (5, 6), (6, 7), (7, 4)}

    corners = np.asarray(corners_2d, dtype=np.float64)
    drawable = np.isfinite(corners).all(axis=1)
    # Zero out non-finite rows before the integer cast; those rows are never drawn.
    pts = np.where(drawable[:, None], corners, 0.0).astype(np.int32)

    for i, j in line_indices:
        if not (drawable[i] and drawable[j]):
            continue
        color = color_front if (i, j) in front_edges else color_back if (i, j) in back_edges else color_side
        # Plain Python ints keep the point arguments acceptable to every OpenCV binding version.
        start = (int(pts[i][0]), int(pts[i][1]))
        end = (int(pts[j][0]), int(pts[j][1]))
        cv2.line(img, start, end, color, thickness, cv2.LINE_AA)
    return img
def back_project_2d_to_3d(uv, depth, calib):
    """Lift a pixel with known depth to camera coordinates, undoing distortion if configured.

    Args:
        uv: (u, v) pixel coordinate.
        depth: Depth along the camera Z axis.
        calib: Mapping with ``fx``, ``fy``, ``cx``, ``cy`` and optionally ``distort_coeffs``,
            or ``None``.

    Returns:
        float64 array ``[x, y, z]``, or ``None`` when ``calib`` is ``None`` or ``depth <= 0``.
    """
    if calib is None or depth <= 0:
        return None

    fx, fy = calib["fx"], calib["fy"]
    cx, cy = calib["cx"], calib["cy"]
    u, v = uv
    x_norm = (u - cx) / fx
    y_norm = (v - cy) / fy

    coeffs = calib.get("distort_coeffs", [])
    if coeffs is not None and len(coeffs) >= 4:
        # The pixel was observed through the fisheye model: invert it first.
        x_norm, y_norm = remove_fisheye_distortion(x_norm, y_norm, coeffs)
    return np.array([x_norm * depth, y_norm * depth, depth], dtype=np.float64)
def reconstruct_3d_box_from_face(face_uv, face_z, dims, rot_y, face_type, calib):
    """Rebuild the 8 box corners from the pixel position and depth of one face centre.

    Args:
        face_uv: (u, v) pixel of the face centre.
        face_z: Depth of the face centre.
        dims: (length, height, width) of the box.
        rot_y: Yaw around the Y axis in radians.
        face_type: Face the centre belongs to (0=front, 1=rear, 2=left, 3=right).
        calib: Calibration mapping, or ``None``.

    Returns:
        (8, 3) corner array, or ``None`` when calibration is missing, the depth is not
        positive, back-projection fails, or a dimension / the yaw is NaN.
    """
    if calib is None or face_z <= 0:
        return None

    face_center = back_project_2d_to_3d(face_uv, face_z, calib)
    if face_center is None:
        return None

    length, height, width = dims
    for value in (length, height, width, rot_y):
        if np.isnan(value):
            return None
    return compute_3d_box_corners(face_center, dims, rot_y, face_type)
def reconstruct_3d_box_from_whole(uv, z3d, dims, rot_y, calib):
    """Rebuild the 8 box corners from the pixel position and depth of the box centre.

    Args:
        uv: (u, v) pixel of the box centre.
        z3d: Depth of the box centre.
        dims: (length, height, width) of the box.
        rot_y: Yaw around the Y axis in radians.
        calib: Calibration mapping, or ``None``.

    Returns:
        (8, 3) corner array, or ``None`` when calibration is missing, the depth is not
        positive, back-projection fails, or a dimension / the yaw is NaN.
    """
    if calib is None or z3d <= 0:
        return None

    box_center = back_project_2d_to_3d(uv, z3d, calib)
    if box_center is None:
        return None

    length, height, width = dims
    for value in (length, height, width, rot_y):
        if np.isnan(value):
            return None
    return compute_3d_box_corners(box_center, dims, rot_y, face_type=-1)
def get_face_bottom_edge_points(corners_3d, face_type, num_samples=5):
    """Sample 3D points along the bottom edge of one box face.

    Args:
        corners_3d: (8, 3) box corners, or ``None``.
        face_type: Face whose bottom edge is sampled (key of ``FACE_BOTTOM_EDGE_CORNERS``).
        num_samples: Number of samples along the edge, endpoints included.

    Returns:
        ``(num_samples, 3)`` array, or ``None`` when corners are missing or the face type
        is unknown.
    """
    if corners_3d is None or face_type not in FACE_BOTTOM_EDGE_CORNERS:
        return None
    first_corner, second_corner = FACE_BOTTOM_EDGE_CORNERS[face_type]
    return sample_3d_edge(corners_3d[first_corner], corners_3d[second_corner], num_samples=num_samples)
def project_face_bottom_edge(corners_3d, face_type, calib, num_samples=5):
    """Sample a face's bottom edge in 3D and project the samples to the image plane.

    Args:
        corners_3d: (8, 3) box corners, or ``None``.
        face_type: Face whose bottom edge is sampled.
        calib: Calibration mapping used for projection.
        num_samples: Number of samples along the edge.

    Returns:
        ``(points_3d, points_2d)`` sorted by increasing pixel u. ``(None, None)`` when the
        edge cannot be sampled; ``(points_3d, None)`` (unsorted) when any projection is NaN.
    """
    edge_3d = get_face_bottom_edge_points(corners_3d, face_type, num_samples=num_samples)
    if edge_3d is None:
        return None, None

    edge_2d = project_3d_to_2d(edge_3d, calib)
    if np.any(np.isnan(edge_2d)):
        return edge_3d, None

    left_to_right = np.argsort(edge_2d[:, 0], kind="stable")
    return edge_3d[left_to_right], edge_2d[left_to_right]
def project_partial_face_bottom_edge(corners_3d, face_type, calib, img_w, img_h, num_samples=5):
    """Project exactly ``num_samples`` points from the in-image part of a face bottom edge.

    Args:
        corners_3d: (8, 3) box corners, or ``None``.
        face_type: Face whose bottom edge is sampled (key of ``FACE_BOTTOM_EDGE_CORNERS``).
        calib: Calibration mapping used for projection.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        num_samples: Number of samples to return.

    Returns:
        The ``(points_3d, points_2d)`` pair from ``sample_partial_3d_edge``, or
        ``(None, None)`` when corners are missing or the face type is unknown.
    """
    if corners_3d is None or face_type not in FACE_BOTTOM_EDGE_CORNERS:
        return None, None
    first_corner, second_corner = FACE_BOTTOM_EDGE_CORNERS[face_type]
    edge_start = corners_3d[first_corner]
    edge_end = corners_3d[second_corner]
    return sample_partial_3d_edge(edge_start, edge_end, calib, img_w, img_h, num_samples=num_samples)
def collect_face_bottom_edges(corners_3d, face_types, calib, num_samples=5):
    """Project bottom-edge samples for every requested face and stack the usable ones.

    Args:
        corners_3d: (8, 3) box corners, or ``None``.
        face_types: Iterable of face types to process.
        calib: Calibration mapping used for projection.
        num_samples: Number of samples per edge.

    Returns:
        ``(points_3d, points_2d)`` as float32. A single usable edge is returned unstacked
        with shapes ``(num_samples, 3)`` / ``(num_samples, 2)``; several edges are stacked
        along a new leading axis. ``(None, None)`` when nothing could be projected.
    """
    if corners_3d is None:
        return None, None

    kept_3d = []
    kept_2d = []
    for face in face_types:
        face_3d, face_2d = project_face_bottom_edge(corners_3d, face, calib, num_samples=num_samples)
        if face_3d is not None and face_2d is not None:
            kept_3d.append(face_3d.astype(np.float32, copy=False))
            kept_2d.append(face_2d.astype(np.float32, copy=False))

    if not kept_2d:
        return None, None
    if len(kept_2d) == 1:
        return kept_3d[0], kept_2d[0]
    return np.stack(kept_3d, axis=0), np.stack(kept_2d, axis=0)
def _edge_batches_to_list(edge_points):
"""Normalize edge sample arrays to a list of `(5, D)` arrays."""
if edge_points is None:
return []
arr = np.asarray(edge_points, dtype=np.float32)
if arr.ndim == 2:
return [arr]
return [arr[i] for i in range(arr.shape[0])]
def _stack_edge_batches(edge_batches):
"""Convert a list of edge sample arrays back to the legacy stacked representation."""
if not edge_batches:
return None
if len(edge_batches) == 1:
return edge_batches[0]
return np.stack(edge_batches, axis=0)
def _append_edge_batch(edge_points_3d, edge_points_2d, decoded_edge):
"""Append one decoded edge sample set to stacked edge arrays."""
if decoded_edge is None:
return edge_points_3d, edge_points_2d
edge3d_list = _edge_batches_to_list(edge_points_3d)
edge2d_list = _edge_batches_to_list(edge_points_2d)
edge3d_list.append(np.asarray(decoded_edge["points_3d"], dtype=np.float32))
edge2d_list.append(np.asarray(decoded_edge["points_2d"], dtype=np.float32))
return _stack_edge_batches(edge3d_list), _stack_edge_batches(edge2d_list)
def collect_precomputed_edge_points_2d(edge_faces_points_2d, edge_faces_valid=None, visible_face_types=()):
    """Turn one object's per-face edge samples into drawable polyline batches.

    Args:
        edge_faces_points_2d: Array-like indexed as ``[face, sample, coord]`` (must be 3D).
        edge_faces_valid: Optional per-face validity flags. Shorter inputs are padded with
            ``False``, longer ones are truncated to the number of faces.
        visible_face_types: Face types to emit first, in the given order.

    Returns:
        float32 samples of the selected faces (a single face unstacked, several stacked),
        ordered with the valid ``visible_face_types`` first and the remaining valid faces
        after them. ``None`` when the input is missing, malformed or has no valid face.
    """
    if edge_faces_points_2d is None:
        return None
    per_face = np.asarray(edge_faces_points_2d, dtype=np.float32)
    if per_face.ndim != 3 or per_face.shape[0] == 0:
        return None
    num_faces = per_face.shape[0]

    if edge_faces_valid is None:
        usable = np.ones(num_faces, dtype=bool)
    else:
        usable = np.asarray(edge_faces_valid, dtype=bool).reshape(-1)
        missing = num_faces - usable.shape[0]
        if missing > 0:
            usable = np.pad(usable, (0, missing), constant_values=False)
        else:
            usable = usable[:num_faces]

    ordered = []
    # Preferred faces first, keeping the caller's order and dropping duplicates.
    for candidate in visible_face_types or ():
        candidate = int(candidate)
        in_range = 0 <= candidate < num_faces
        if in_range and usable[candidate] and candidate not in ordered:
            ordered.append(candidate)
    # Then every remaining valid face in index order.
    for candidate in np.flatnonzero(usable):
        candidate = int(candidate)
        if candidate not in ordered:
            ordered.append(candidate)

    if not ordered:
        return None
    return _stack_edge_batches([per_face[face].astype(np.float32, copy=False) for face in ordered])
def decode_visible_face_edge_from_prediction(pred_edge_60, face_type, anchor_xy, stride):
    """Decode one face's block of the auxiliary edge prediction into pixels and depths.

    Each face owns 15 consecutive values read as 5 samples of (x offset, y offset, depth).
    Offsets are added to the anchor position and scaled by ``stride`` to obtain pixels.

    Args:
        pred_edge_60: Edge prediction vector, or ``None``.
        face_type: Face index in 0..3.
        anchor_xy: (x, y) anchor position in grid units.
        stride: Pixels per grid unit.

    Returns:
        dict with ``"points_2d"`` (5, 2), ``"depths"`` (5,) and ``"face_type"``, with the
        samples sorted by increasing pixel u. ``None`` when the prediction is missing or
        the face type is out of range.
    """
    if pred_edge_60 is None or face_type not in range(4):
        return None

    start = FACE_EDGE_OFFSETS_60[face_type]
    samples = np.asarray(pred_edge_60[start : start + 15], dtype=np.float32).reshape(5, 3)

    pixels = np.empty((5, 2), dtype=np.float32)
    pixels[:, 0] = (anchor_xy[0] + samples[:, 0]) * stride
    pixels[:, 1] = (anchor_xy[1] + samples[:, 1]) * stride

    left_to_right = np.argsort(pixels[:, 0], kind="stable")
    return {
        "points_2d": pixels[left_to_right],
        "depths": samples[left_to_right, 2].astype(np.float32),
        "face_type": face_type,
    }
def _is_gt_face_cut(target_42, face_type):
"""Return whether a GT face was invalidated by crop handling."""
if face_type not in range(4):
return False
off = FACE_OFFSETS_42[face_type]
face = target_42[off : off + 8]
return np.all(face[:6] == -1) and face[7] <= 0
def get_gt_cut_state(target_42):
    """Derive the cut-object state from which GT faces were invalidated.

    Args:
        target_42: GT target vector, or ``None``.

    Returns:
        ``CUT_STATE_IN`` when the rear and both side faces are cut, ``CUT_STATE_OUT`` when
        the front and both side faces are cut, otherwise ``CUT_STATE_NORMAL`` (also for a
        missing or too-short target).
    """
    if target_42 is None or len(target_42) < 42:
        return CUT_STATE_NORMAL

    front_cut = _is_gt_face_cut(target_42, 0)
    rear_cut = _is_gt_face_cut(target_42, 1)
    left_cut = _is_gt_face_cut(target_42, 2)
    right_cut = _is_gt_face_cut(target_42, 3)
    both_sides_cut = left_cut and right_cut

    if rear_cut and both_sides_cut:
        return CUT_STATE_IN
    if front_cut and both_sides_cut:
        return CUT_STATE_OUT
    return CUT_STATE_NORMAL
def get_gt_cut_side(target_42, img_w, img_h, tol=1e-4, score_thr=FACE_VISIBILITY_SCORE_THRESH):
    """Infer whether a cut GT object is clipped at the left or right image border.

    Within each face block, indices 4 and 5 are read as the normalized u/v of the face,
    index 6 as its visibility score and index 7 as its validity flag.

    Args:
        target_42: GT target vector.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        tol: Pixel tolerance used when comparing against the borders.
        score_thr: Minimum score for a face to count as visible.

    Returns:
        ``"left"``, ``"right"`` or ``None`` when no visible face exists or the reference
        u coordinate touches neither border.
    """
    candidates = []
    for off in FACE_OFFSETS_42:
        block = target_42[off : off + 8]
        flagged_visible = block[7] == 1 and not np.isnan(block[6]) and not block[6] < score_thr
        if not flagged_visible:
            continue
        has_position = not (np.isnan(block[4]) or np.isnan(block[5]) or block[4] < 0 or block[5] < 0)
        if not has_position:
            continue
        candidates.append((block[4] * img_w, float(block[6])))
    if not candidates:
        return None

    # Default reference: u of the highest-scoring visible face (first one wins ties).
    reference_u = max(candidates, key=lambda entry: entry[1])[0]

    # Side faces with a usable u coordinate override the default reference.
    side_us = []
    for side_face in (2, 3):
        off = FACE_OFFSETS_42[side_face]
        block = target_42[off : off + 8]
        if np.isnan(block[4]) or block[4] < 0:
            continue
        side_us.append(block[4] * img_w)
    if side_us:
        reference_u = side_us[0] if len(side_us) == 1 else float(np.mean(side_us))

    if reference_u <= tol:
        return "left"
    if reference_u >= img_w - 1 - tol:
        return "right"
    return None
def get_cut_side_from_bbox_xyxy(bbox_xyxy, img_w, tol=1.0):
    """Infer from a 2D box which vertical image border (if exactly one) it touches.

    Args:
        bbox_xyxy: (x1, y1, x2, y2) box in pixels, or ``None``.
        img_w: Image width in pixels.
        tol: Pixel tolerance for "touching" a border.

    Returns:
        ``"left"`` or ``"right"`` when the box touches exactly one of the two borders;
        ``None`` when it touches both, neither, or no box is given.
    """
    if bbox_xyxy is None:
        return None

    x_min, _, x_max, _ = np.asarray(bbox_xyxy, dtype=np.float64)
    hits_left = x_min <= tol and x_max > tol
    hits_right = x_max >= img_w - 1 - tol and x_min < img_w - 1 - tol

    if hits_left and not hits_right:
        return "left"
    if hits_right and not hits_left:
        return "right"
    return None
def _get_camera_facing_side_face_from_corners(corners_3d):
"""Return the side face whose outward normal points most toward the camera."""
if corners_3d is None:
return None
corners = np.asarray(corners_3d, dtype=np.float64)
if corners.shape != (8, 3) or not np.isfinite(corners).all():
return None
box_center = corners.mean(axis=0)
best_face_type, best_score = None, -np.inf
for face_type in (2, 3):
face_points = corners[list(FACE_CORNERS[face_type])]
face_center = face_points.mean(axis=0)
view_dir = -face_center
view_norm = float(np.linalg.norm(view_dir))
if view_norm < 1e-8:
continue
edge_a = face_points[1] - face_points[0]
edge_b = face_points[2] - face_points[1]
normal = np.cross(edge_a, edge_b)
normal_norm = float(np.linalg.norm(normal))
if normal_norm < 1e-8:
continue
if np.dot(normal, face_center - box_center) < 0:
normal = -normal
score = float(np.dot(normal / normal_norm, view_dir / view_norm))
if score > best_score:
best_face_type, best_score = face_type, score
return best_face_type
def get_cut_object_side_face(face_type_or_state, cut_side=None, corners_3d=None):
    """Resolve the partially visible side face for a cut object.

    Reconstructed box geometry is preferred when available so the near side can change
    with yaw. When it is not usable, the image-border heuristic is applied: a left-side
    cut maps to face 3 and a right-side cut to face 2.

    Args:
        face_type_or_state: Cut state; only ``CUT_STATE_IN`` / ``CUT_STATE_OUT`` are handled.
        cut_side: ``"left"``, ``"right"`` or ``None``.
        corners_3d: Optional (8, 3) box corners.

    Returns:
        ``2`` or ``3``, or ``None`` when the object is not cut or nothing can be resolved.
    """
    if face_type_or_state not in {CUT_STATE_IN, CUT_STATE_OUT}:
        return None

    geometric_choice = _get_camera_facing_side_face_from_corners(corners_3d)
    if geometric_choice in (2, 3):
        return geometric_choice

    # Fallback heuristic keyed on the clipped image border; unknown sides yield None.
    return {"left": 3, "right": 2}.get(cut_side)
def get_cut_object_side_face_from_yaw(cut_state, yaw):
    """Infer the partially visible side face from the cut state and the whole-box yaw.

    Args:
        cut_state: One of the ``CUT_STATE_*`` values.
        yaw: Whole-box yaw in radians.

    Returns:
        For ``CUT_STATE_IN``: ``3`` when ``sin(yaw) > 0`` else ``2``.
        For ``CUT_STATE_OUT``: ``2`` when ``sin(yaw) < 0`` else ``3``.
        ``None`` for any other state.
    """
    if cut_state == CUT_STATE_IN:
        heading_positive = np.sin(float(yaw)) > 0
        return 3 if heading_positive else 2
    if cut_state == CUT_STATE_OUT:
        heading_negative = np.sin(float(yaw)) < 0
        return 2 if heading_negative else 3
    return None
def get_pred_cut_state(pred_41):
    """Pick the predicted cut state as the arg-max of the three cut logits.

    Args:
        pred_41: Prediction vector; entries 38..40 are read as the cut-state logits.

    Returns:
        int: Index (0, 1 or 2) of the largest logit.
    """
    logits = np.asarray(pred_41[38:41], dtype=np.float32)
    winner = np.argmax(logits)
    return int(winner)
def get_pred_cut_primary_face(cut_state):
    """Map a cut state to the longitudinal face that must be used for it.

    Args:
        cut_state: One of the ``CUT_STATE_*`` values.

    Returns:
        ``0`` (front) for ``CUT_STATE_IN``, ``1`` (rear) for ``CUT_STATE_OUT``, otherwise
        ``None``.
    """
    if cut_state == CUT_STATE_IN:
        return 0
    return 1 if cut_state == CUT_STATE_OUT else None
def _reconstruct_pred_corners_for_cut_edge(pred_41, anchor_xy, stride, calib, cut_state=None):
"""Reconstruct predicted box corners for cut-edge side-face selection."""
if calib is None:
return None
cut_state = get_pred_cut_state(pred_41) if cut_state is None else int(cut_state)
dims = np.asarray(pred_41[27:30], dtype=np.float32)
rot_y = _decode_yaw_from_prediction(pred_41)
if np.any(np.isnan(dims)) or not np.isfinite(rot_y):
return None
primary_face = get_pred_cut_primary_face(cut_state)
if primary_face is not None:
off = FACE_OFFSETS_41[primary_face]
z_face = float(pred_41[off])
uv_face_offset = np.asarray(pred_41[off + 1 : off + 3], dtype=np.float32)
if np.isfinite(z_face) and z_face > 0 and np.isfinite(uv_face_offset).all():
u_face = float((anchor_xy[0] + uv_face_offset[0]) * stride)
v_face = float((anchor_xy[1] + uv_face_offset[1]) * stride)
corners = reconstruct_3d_box_from_face((u_face, v_face), z_face, dims, rot_y, primary_face, calib)
if corners is not None:
return corners
z_whole = float(pred_41[24])
uv_whole_offset = np.asarray(pred_41[25:27], dtype=np.float32)
if not np.isfinite(z_whole) or z_whole <= 0 or not np.isfinite(uv_whole_offset).all():
return None
u_whole = float((anchor_xy[0] + uv_whole_offset[0]) * stride)
v_whole = float((anchor_xy[1] + uv_whole_offset[1]) * stride)
return reconstruct_3d_box_from_whole((u_whole, v_whole), z_whole, dims, rot_y, calib)
def _resolve_pred_cut_state_for_decode(pred_41, bbox_xyxy=None, img_w=None):
    """Accept a predicted cut state only if the 2D box really touches one image border.

    Args:
        pred_41: Prediction vector.
        bbox_xyxy: Optional (x1, y1, x2, y2) box in pixels.
        img_w: Optional image width in pixels.

    Returns:
        ``(cut_state, cut_side)``. A non-normal prediction is downgraded to
        ``(CUT_STATE_NORMAL, None)`` unless the box touches exactly the left or the right
        border; a normal prediction is returned with ``cut_side`` set to ``None``.
    """
    state = get_pred_cut_state(pred_41)
    if state == CUT_STATE_NORMAL:
        return state, None

    side = None
    can_check_border = bbox_xyxy is not None and img_w is not None
    if can_check_border:
        side = get_cut_side_from_bbox_xyxy(bbox_xyxy, img_w)

    if side in {"left", "right"}:
        return state, side
    return CUT_STATE_NORMAL, None
def _select_best_pred_face_score(pred_41):
    """Find the predicted face with the highest finite score, ignoring any threshold.

    The score of each face is read at index 5 of its block in ``pred_41``.

    Args:
        pred_41: Prediction vector.

    Returns:
        ``(face_type, score)`` of the best face (the lowest face index wins ties), or
        ``None`` when no face has a finite score.
    """
    best = None
    for face, base in enumerate(FACE_OFFSETS_41):
        face_score = float(pred_41[base + 5])
        if not np.isfinite(face_score):
            continue
        if best is None or face_score > best[1]:
            best = (int(face), float(face_score))
    return best
def select_pred_visible_faces_for_decode(pred_41, score_thr=FACE_VISIBILITY_SCORE_THRESH, bbox_xyxy=None, img_w=None):
    """Return visible faces used for decoding and drawing.

    For cut objects the intended semantics are enforced:
        - cut_in  -> front face only
        - cut_out -> rear face only
    For normal objects the thresholded visible-face list is kept, but the top-1 face is
    always retained even if its score is below the threshold. The partial side edge is
    handled separately by the cut-edge decoder.

    Args:
        pred_41: Prediction vector.
        score_thr: Minimum score for a face to count as visible.
        bbox_xyxy: Optional 2D box used to confirm that the object is really clipped.
        img_w: Optional image width in pixels.

    Returns:
        list of ``(face_type, score)`` tuples.
    """
    cut_state, _ = _resolve_pred_cut_state_for_decode(pred_41, bbox_xyxy=bbox_xyxy, img_w=img_w)
    forced_face = get_pred_cut_primary_face(cut_state)
    if forced_face is not None:
        forced_score = float(pred_41[FACE_OFFSETS_41[forced_face] + 5])
        return [(forced_face, forced_score)]

    faces = list(select_pred_visible_faces(pred_41, score_thr=score_thr))
    top = _select_best_pred_face_score(pred_41)
    if top is None:
        return faces

    top_type, top_score = top
    already_listed = any(int(listed_type) == int(top_type) for listed_type, _ in faces)
    if not already_listed:
        faces.append((int(top_type), float(top_score)))
    return faces
def decode_cut_partial_side_edge_from_prediction(
    pred_41, pred_edge_60, anchor_xy, stride, img_w, cut_side=None, calib=None, corners_3d=None
):
    """Decode the partially visible side bottom edge for a cut prediction.

    Args:
        pred_41: Prediction vector.
        pred_edge_60: Auxiliary edge prediction vector, or ``None``.
        anchor_xy: (x, y) anchor position in grid units.
        stride: Pixels per grid unit.
        img_w: Image width in pixels (not used by this function).
        cut_side: ``"left"``, ``"right"`` or ``None``.
        calib: Optional calibration mapping, used to rebuild corners when none are given.
        corners_3d: Optional precomputed (8, 3) box corners.

    Returns:
        The decoded edge dict extended with ``"cut_state"``, ``"cut_side"`` and
        ``"is_partial": True``; ``None`` when there is no edge prediction, the object is
        not predicted as cut, or no side face can be resolved.
    """
    if pred_edge_60 is None:
        return None
    state = get_pred_cut_state(pred_41)
    if state == CUT_STATE_NORMAL:
        return None

    box_corners = corners_3d
    if box_corners is None and calib is not None:
        box_corners = _reconstruct_pred_corners_for_cut_edge(pred_41, anchor_xy, stride, calib, cut_state=state)

    side_face = get_cut_object_side_face(state, cut_side, corners_3d=box_corners)
    if side_face is None:
        return None

    edge = decode_visible_face_edge_from_prediction(pred_edge_60, side_face, anchor_xy, stride)
    if edge is None:
        return None
    edge["cut_state"] = state
    edge["cut_side"] = cut_side
    edge["is_partial"] = True
    return edge
def _resolve_gt_cut_partial_side_face(target_42, img_w, img_h, bbox_xyxy=None, score_thr=FACE_VISIBILITY_SCORE_THRESH):
    """Resolve the cut state and the clipped image side of a GT object.

    The 2D box is consulted first; the face-based heuristic is used only when the box
    gives no answer.

    Args:
        target_42: GT target vector.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        bbox_xyxy: Optional (x1, y1, x2, y2) box in pixels.
        score_thr: Minimum score for a face to count as visible.

    Returns:
        ``(cut_state, cut_side)``; ``cut_side`` is ``None`` for normal objects or when
        the side cannot be determined.
    """
    state = get_gt_cut_state(target_42)
    if state == CUT_STATE_NORMAL:
        return state, None

    side = get_cut_side_from_bbox_xyxy(bbox_xyxy, img_w)
    if side is None:
        side = get_gt_cut_side(target_42, img_w, img_h, score_thr=score_thr)
    return state, side
def _reconstruct_gt_corners_for_cut_edge(
    target_42, cls_id, calib, img_w, img_h, face_3d_classes, complete_3d_classes, score_thr=FACE_VISIBILITY_SCORE_THRESH
):
    """Rebuild GT box corners, preferring the best visible face over the whole-box centre.

    From ``target_42`` this reads the dimensions at ``[3:6]``, the yaw at ``[6]``, the
    whole-box depth at ``[2]`` and the normalized whole-box centre at ``[7:9]``.

    Args:
        target_42: GT target vector (NumPy array).
        cls_id: Class id of the object.
        calib: Calibration mapping (may contain ``"depth_scale"``), or ``None``.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Classes annotated with per-face 3D information.
        complete_3d_classes: Classes annotated with whole-box 3D information.
        score_thr: Minimum score for a face to count as visible.

    Returns:
        (8, 3) corner array, or ``None`` when no valid geometry source is available.
    """
    if calib is None:
        return None

    depth_scale = calib.get("depth_scale", 1.0)
    box_dims = target_42[3:6].astype(np.float32)
    yaw = float(target_42[6])
    if np.any(np.isnan(box_dims)) or not np.isfinite(yaw):
        return None

    is_face_class = cls_id in face_3d_classes
    if is_face_class:
        candidates = select_gt_visible_faces(target_42, score_thr=score_thr)
        if candidates:
            # Anchor the box on the visible face with the highest score.
            top_type, top_face = max(candidates, key=lambda entry: float(entry[1][6]))
            face_u = float(top_face[4] * img_w)
            face_v = float(top_face[5] * img_h)
            face_depth = float(top_face[2] * depth_scale)
            if np.isfinite(face_u) and np.isfinite(face_v) and np.isfinite(face_depth) and face_depth > 0:
                from_face = reconstruct_3d_box_from_face((face_u, face_v), face_depth, box_dims, yaw, top_type, calib)
                if from_face is not None:
                    return from_face

    if cls_id not in face_3d_classes and cls_id not in complete_3d_classes:
        return None

    center_depth = float(target_42[2])
    center_uv = target_42[7:9]
    if np.any(np.isnan(center_uv)) or not np.isfinite(center_depth) or center_depth <= 0:
        return None
    center_pixel = (float(center_uv[0] * img_w), float(center_uv[1] * img_h))
    return reconstruct_3d_box_from_whole(center_pixel, float(center_depth * depth_scale), box_dims, yaw, calib)
def decode_cut_partial_side_edge_from_gt(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    bbox_xyxy=None,
    corners_3d=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
):
    """Decode the partially visible side bottom edge for a cut GT object.

    Args:
        target_42: GT target vector.
        cls_id: Class id of the object.
        calib: Calibration mapping.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Classes annotated with per-face 3D information.
        complete_3d_classes: Classes annotated with whole-box 3D information.
        bbox_xyxy: Optional (x1, y1, x2, y2) box in pixels.
        corners_3d: Optional precomputed (8, 3) box corners.
        score_thr: Minimum score for a face to count as visible.

    Returns:
        dict with float32 ``"points_3d"``, ``"points_2d"`` and ``"depths"`` plus
        ``"face_type"``, ``"cut_state"``, ``"cut_side"`` and ``"is_partial": True``;
        ``None`` when the class has no face annotation, the clipped side is unknown, no
        corners are available, the resolved side face is not marked as cut in the GT, or
        no visible part of the edge can be sampled.
    """
    if cls_id not in face_3d_classes:
        return None

    state, side = _resolve_gt_cut_partial_side_face(
        target_42, img_w, img_h, bbox_xyxy=bbox_xyxy, score_thr=score_thr
    )
    if side not in {"left", "right"}:
        return None

    box_corners = corners_3d
    if box_corners is None:
        box_corners = _reconstruct_gt_corners_for_cut_edge(
            target_42, cls_id, calib, img_w, img_h, face_3d_classes, complete_3d_classes, score_thr=score_thr
        )
    if box_corners is None:
        return None

    side_face = get_cut_object_side_face(state, side, corners_3d=box_corners)
    if side_face is None or not _is_gt_face_cut(target_42, side_face):
        return None

    edge_3d, edge_2d = project_partial_face_bottom_edge(box_corners, side_face, calib, img_w, img_h, num_samples=5)
    if edge_3d is None or edge_2d is None:
        return None

    return {
        "points_3d": edge_3d.astype(np.float32),
        "points_2d": edge_2d.astype(np.float32),
        "depths": edge_3d[:, 2].astype(np.float32),
        "face_type": side_face,
        "cut_state": state,
        "cut_side": side,
        "is_partial": True,
    }
def decode_visible_face_edge_from_gt(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    face_type=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
):
    """Decode GT visible-face bottom-edge samples from the current camera geometry.

    The partial side edge of a cut object takes precedence when it exists and matches the
    requested face (or when no face was requested). Otherwise the full bottom edge of the
    selected face is projected from the decoded GT box.

    Args:
        target_42: GT target vector.
        cls_id: Class id of the object.
        calib: Calibration mapping.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Classes annotated with per-face 3D information.
        complete_3d_classes: Classes annotated with whole-box 3D information.
        face_type: Face to decode; ``None`` uses the decoded target's ``visible_face_type``.
        score_thr: Minimum score for a face to count as visible.
        bbox_xyxy: Optional (x1, y1, x2, y2) box in pixels.

    Returns:
        dict with float32 ``"points_3d"``, ``"points_2d"``, ``"depths"`` and
        ``"face_type"`` (plus the cut metadata for a partial edge), or ``None`` when
        nothing can be decoded. An explicitly requested face must be among the GT visible
        faces.
    """
    if cls_id not in face_3d_classes:
        return None

    partial = decode_cut_partial_side_edge_from_gt(
        target_42,
        cls_id,
        calib,
        img_w,
        img_h,
        face_3d_classes,
        complete_3d_classes,
        bbox_xyxy=bbox_xyxy,
        score_thr=score_thr,
    )
    if partial is not None and (face_type is None or face_type == partial["face_type"]):
        return partial

    decoded = decode_3d_target(
        target_42, cls_id, calib, img_w, img_h, face_3d_classes, complete_3d_classes, score_thr=score_thr
    )
    if decoded is None or decoded.get("corners_3d") is None:
        return None

    gt_visible = tuple(int(visible_type) for visible_type, _ in select_gt_visible_faces(target_42, score_thr=score_thr))
    chosen = decoded.get("visible_face_type") if face_type is None else face_type
    if chosen not in range(4):
        return None
    if face_type is not None and chosen not in gt_visible:
        return None

    edge_3d, edge_2d = project_face_bottom_edge(decoded["corners_3d"], chosen, calib, num_samples=5)
    if edge_3d is None or edge_2d is None:
        return None

    return {
        "points_3d": edge_3d.astype(np.float32),
        "points_2d": edge_2d.astype(np.float32),
        "depths": edge_3d[:, 2].astype(np.float32),
        "face_type": chosen,
    }
def _decoded_edge_to_points_3d(decoded_edge, calib):
"""Back-project one decoded edge sample set into 3D camera coordinates."""
if decoded_edge is None:
return None
points_3d = []
for pt, depth in zip(decoded_edge["points_2d"], decoded_edge["depths"]):
point_3d = back_project_2d_to_3d(tuple(pt), float(depth), calib)
if point_3d is None:
return None
points_3d.append(point_3d)
return np.asarray(points_3d, dtype=np.float32)
def _decoded_edge_points_are_drawable(points_2d, img_w=None, img_h=None, min_endpoint_dist_px=2.0):
"""Return whether decoded edge points correspond to a visible, drawable in-image segment."""
if points_2d is None:
return False
pts = np.asarray(points_2d, dtype=np.float32)
if pts.ndim != 2 or pts.shape[0] < 2 or pts.shape[1] != 2 or not np.isfinite(pts).all():
return False
if img_w is not None and img_h is not None:
if not np.all([_point_inside_image(point_2d, img_w, img_h) for point_2d in pts]):
return False
endpoint_dist = float(np.linalg.norm(pts[-1] - pts[0]))
return endpoint_dist >= float(min_endpoint_dist_px)
def _edge_segment_length_3d(points_3d):
"""Return the visible BEV length of one decoded bottom-edge segment.
Bottom-edge size recovery should ignore vertical noise in the decoded points and only measure the
ground-plane extent (x/z).
"""
if points_3d is None:
return None
pts = np.asarray(points_3d, dtype=np.float32)
if pts.ndim != 2 or pts.shape[0] < 2 or pts.shape[1] != 3 or not np.isfinite(pts).all():
return None
return float(np.linalg.norm(pts[-1, [0, 2]] - pts[0, [0, 2]]))
def _prediction_lateral_distance_m_from_center(center):
"""Return absolute lateral distance from any predicted metric-space anchor center."""
if center is None:
return None
center = np.asarray(center, dtype=np.float32).reshape(-1)
if center.shape[0] < 1 or not np.isfinite(center[0]):
return None
return float(abs(center[0]))
def edge_points_to_yaw(points_3d, face_type):
    """Infer whole-box yaw from visible-face bottom-edge 3D samples.

    Args:
        points_3d: (N, 3) sequence of edge samples. Only x (column 0) and z (column 2) are used;
            rows containing non-finite values are dropped.
        face_type: Face index in 0..3. For faces 0/1 the edge is treated as perpendicular to the
            heading, for faces 2/3 as parallel to it.

    Returns:
        Yaw in radians wrapped to [-pi, pi), or NaN when the input is missing, malformed or degenerate.
    """
    if points_3d is None or len(points_3d) < 2 or face_type not in range(4):
        return float("nan")
    pts = np.asarray(points_3d, dtype=np.float64)
    # Malformed input (flat vector, or rows without a z column) is reported as NaN like every other
    # invalid input instead of surfacing as an AxisError/IndexError further down.
    if pts.ndim != 2 or pts.shape[1] < 3:
        return float("nan")
    pts = pts[np.isfinite(pts).all(axis=1)]
    if len(pts) < 2:
        return float("nan")

    tangent = np.array([pts[-1, 0] - pts[0, 0], pts[-1, 2] - pts[0, 2]], dtype=np.float64)
    tangent_norm = float(np.linalg.norm(tangent))
    if tangent_norm < 1e-8:
        return float("nan")
    tangent /= tangent_norm
    midpoint = np.mean(pts[:, [0, 2]], axis=0)

    def _rot_cw(v):
        return np.array([v[1], -v[0]], dtype=np.float64)

    def _rot_ccw(v):
        return np.array([-v[1], v[0]], dtype=np.float64)

    if face_type in (0, 1):
        forward_candidates = (_rot_cw(tangent), -_rot_cw(tangent))
    else:
        forward_candidates = (tangent, -tangent)

    def _face_normal(forward):
        if face_type == 0:
            return forward
        if face_type == 1:
            return -forward
        if face_type == 2:
            return _rot_ccw(forward)
        return -_rot_ccw(forward)

    # The edge samples arrive sorted left-to-right in image space, so the tangent has an
    # unavoidable 180-degree ambiguity in world space. Resolve it by selecting the forward
    # direction whose face normal points most toward the camera for the requested visible face.
    best_forward = min(forward_candidates, key=lambda forward: float(np.dot(_face_normal(forward), midpoint)))
    yaw = np.arctan2(-best_forward[1], best_forward[0])
    return float((yaw + np.pi) % (2 * np.pi) - np.pi)
def visible_face_edges_to_yaw(face_edges_3d, face_scores=None):
    """Estimate whole-box yaw from one or more visible-face bottom edges.

    Args:
        face_edges_3d: Mapping ``face_type -> points`` or an iterable of ``(face_type, points)`` pairs.
        face_scores: Optional per-face weights (mapping or indexable). Missing, non-finite or
            non-positive weights count as 1.0.

    Returns:
        Yaw in radians wrapped to [-pi, pi), or NaN when no edge yields a usable heading. When both a
        front/rear edge and a side edge are present, the joint two-edge estimate is tried first;
        otherwise (or when it fails) the per-edge headings are averaged as weighted unit vectors.
    """
    if face_edges_3d is None:
        return float("nan")
    pairs = list(face_edges_3d.items() if hasattr(face_edges_3d, "items") else face_edges_3d)

    def _score_for(face_id):
        """Look up the weight of one face, falling back to 1.0 for unusable values."""
        if face_scores is None:
            return 1.0
        raw = face_scores.get(face_id, 1.0) if hasattr(face_scores, "get") else face_scores[face_id]
        if not np.isfinite(raw) or raw <= 0:
            return 1.0
        return raw

    entries = [
        {
            "face_type": int(face_id),
            "points_3d": np.asarray(edge, dtype=np.float32),
            "score": float(_score_for(face_id)),
        }
        for face_id, edge in pairs
    ]
    front_rear = [entry for entry in entries if entry["face_type"] in (0, 1)]
    lateral = [entry for entry in entries if entry["face_type"] in (2, 3)]
    if front_rear and lateral:
        # Pair up the highest-scoring edge of each kind for the joint estimate.
        top_front_rear = max(front_rear, key=lambda entry: entry["score"])
        top_lateral = max(lateral, key=lambda entry: entry["score"])
        joint_yaw = _estimate_two_edge_yaw_from_candidates(top_front_rear, top_lateral)
        if np.isfinite(joint_yaw):
            return joint_yaw

    headings, scores = [], []
    for face_id, edge in pairs:
        heading = edge_points_to_yaw(edge, face_id)
        if not np.isfinite(heading):
            continue
        headings.append(float(heading))
        scores.append(float(_score_for(face_id)))

    if not headings:
        return float("nan")
    if len(headings) == 1:
        return float(headings[0])

    # Average on the unit circle so headings near +/-pi do not cancel out.
    unit_vectors = np.stack([np.cos(headings), -np.sin(headings)], axis=1)
    blended = np.sum(unit_vectors * np.asarray(scores, dtype=np.float64)[:, None], axis=0)
    magnitude = float(np.linalg.norm(blended))
    if magnitude < 1e-8:
        # Opposing headings cancelled: trust the single best-scored edge.
        return float(headings[int(np.argmax(scores))])
    blended /= magnitude
    yaw = np.arctan2(-blended[1], blended[0])
    return float((yaw + np.pi) % (2 * np.pi) - np.pi)
def _bev_edge_points(points_3d):
"""Return finite (x, z) BEV points for one decoded edge."""
pts = np.asarray(points_3d, dtype=np.float64)
if pts.ndim != 2 or pts.shape[0] < 2 or pts.shape[1] != 3:
return None
valid = np.isfinite(pts).all(axis=1)
pts = pts[valid]
if len(pts) < 2:
return None
return pts[:, [0, 2]]
def _fit_bev_edge_axis(points_3d):
    """Fit the dominant bird's-eye-view direction of one decoded edge via SVD.

    Returns:
        ``(axis, midpoint)`` where ``axis`` is the unit-length first right-singular vector of the
        centered (x, z) samples and ``midpoint`` is their mean, or ``(None, None)`` on failure.
        The sign of ``axis`` is whatever the SVD yields; callers resolve the direction themselves.
    """
    failure = (None, None)
    xz = _bev_edge_points(points_3d)
    if xz is None:
        return failure
    centroid = np.mean(xz, axis=0)
    try:
        right_vectors = np.linalg.svd(xz - centroid, full_matrices=False)[2]
    except np.linalg.LinAlgError:
        return failure
    direction = np.asarray(right_vectors[0], dtype=np.float64)
    length = float(np.linalg.norm(direction))
    if length < 1e-8:
        return failure
    return direction / length, centroid
def _estimate_two_edge_yaw_from_candidates(
longitudinal_candidate,
side_candidate,
reference_yaw=None,
):
"""Estimate yaw from two edges in BEV while keeping the box as parallel as possible to the side edge."""
if longitudinal_candidate is None or side_candidate is None:
return float("nan")
if int(longitudinal_candidate["face_type"]) not in (0, 1) or int(side_candidate["face_type"]) not in (2, 3):
return float("nan")
side_axis, side_midpoint = _fit_bev_edge_axis(side_candidate["points_3d"])
long_axis, long_midpoint = _fit_bev_edge_axis(longitudinal_candidate["points_3d"])
if side_axis is None or long_midpoint is None or side_midpoint is None:
return float("nan")
long_face_type = int(longitudinal_candidate["face_type"])
side_face_type = int(side_candidate["face_type"])
def _rot_ccw(v):
return np.array([-v[1], v[0]], dtype=np.float64)
def _face_normal(forward, face_type):
if face_type == 0:
return forward
if face_type == 1:
return -forward
if face_type == 2:
return _rot_ccw(forward)
return -_rot_ccw(forward)
forward_candidates = (side_axis, -side_axis)
best_forward = min(
forward_candidates,
key=lambda forward: float(np.dot(_face_normal(forward, long_face_type), long_midpoint))
+ float(np.dot(_face_normal(forward, side_face_type), side_midpoint)),
)
if reference_yaw is not None and np.isfinite(reference_yaw):
ref_forward = np.array([np.cos(float(reference_yaw)), -np.sin(float(reference_yaw))], dtype=np.float64)
if float(np.dot(best_forward, ref_forward)) < 0.0:
best_forward = -best_forward
yaw = np.arctan2(-best_forward[1], best_forward[0])
return float((yaw + np.pi) % (2 * np.pi) - np.pi)
def _resolve_two_face_candidate_roles(candidates, yaw):
"""Assign one decoded edge to the longitudinal face and the other to the side face from geometry."""
if candidates is None or len(candidates) < 2 or not np.isfinite(float(yaw)):
return None
forward_bev = np.array([np.cos(float(yaw)), -np.sin(float(yaw))], dtype=np.float64)
right_bev = np.array([np.sin(float(yaw)), np.cos(float(yaw))], dtype=np.float64)
role_candidates = []
for index, candidate in enumerate(candidates[:2]):
axis, midpoint = _fit_bev_edge_axis(candidate["points_3d"])
if axis is None or midpoint is None:
return None
role_candidates.append(
{
"index": int(index),
"candidate": candidate,
"axis": axis,
"midpoint": midpoint,
"forward_align": abs(float(np.dot(axis, forward_bev))),
"right_align": abs(float(np.dot(axis, right_bev))),
}
)
def _role_label_penalty(info, role):
face_type = int(info["candidate"].get("face_type", -1))
if role == "longitudinal":
return 0 if face_type in (0, 1) else 1
return 0 if face_type in (2, 3) else 1
assignments = ((0, 1), (1, 0))
best_assignment = min(
assignments,
key=lambda assignment: (
(1.0 - role_candidates[assignment[0]]["right_align"]) + (1.0 - role_candidates[assignment[1]]["forward_align"]),
_role_label_penalty(role_candidates[assignment[0]], "longitudinal")
+ _role_label_penalty(role_candidates[assignment[1]], "side"),
-(role_candidates[assignment[0]]["right_align"] + role_candidates[assignment[1]]["forward_align"]),
),
)
longitudinal_info = role_candidates[best_assignment[0]]
side_info = role_candidates[best_assignment[1]]
return {
"forward_bev": forward_bev,
"right_bev": right_bev,
"longitudinal": longitudinal_info,
"side": side_info,
}
def _resolve_two_face_center_from_geometry(longitudinal_info, side_info, length_m, width_m):
"""Recover the two-face box center from the pair of perpendicular visible edges."""
if longitudinal_info is None or side_info is None:
return None
forward_bev = np.asarray(longitudinal_info["forward_bev"], dtype=np.float64)
right_bev = np.asarray(longitudinal_info["right_bev"], dtype=np.float64)
long_mid = np.asarray(longitudinal_info["midpoint"], dtype=np.float64)
side_mid = np.asarray(side_info["midpoint"], dtype=np.float64)
if not np.isfinite(long_mid).all() or not np.isfinite(side_mid).all():
return None
raw_longitudinal_face_type = int(longitudinal_info["candidate"].get("face_type", -1))
if raw_longitudinal_face_type == 0:
longitudinal_options = ((1.0, 0),)
elif raw_longitudinal_face_type == 1:
longitudinal_options = ((-1.0, 1),)
else:
longitudinal_options = ((1.0, 0), (-1.0, 1))
best = None
for longitudinal_sign, longitudinal_face_type in longitudinal_options:
center_from_longitudinal = long_mid - longitudinal_sign * forward_bev * (float(length_m) * 0.5)
for side_sign, side_face_type in ((1.0, 2), (-1.0, 3)):
center_from_side = side_mid - side_sign * right_bev * (float(width_m) * 0.5)
disagreement = float(np.linalg.norm(center_from_longitudinal - center_from_side))
if best is None or disagreement < best["disagreement"]:
best = {
"center_from_longitudinal": center_from_longitudinal,
"center_from_side": center_from_side,
"longitudinal_face_type": int(longitudinal_face_type),
"side_face_type": int(side_face_type),
"disagreement": disagreement,
}
if best is None:
return None
longitudinal_coord = float(np.dot(best["center_from_longitudinal"], forward_bev))
lateral_coord = float(np.dot(best["center_from_side"], right_bev))
center_bev = longitudinal_coord * forward_bev + lateral_coord * right_bev
return {
"center_bev": center_bev,
"longitudinal_face_type": int(best["longitudinal_face_type"]),
"side_face_type": int(best["side_face_type"]),
"center_from_longitudinal": best["center_from_longitudinal"],
"center_from_side": best["center_from_side"],
}
def _estimate_single_edge_yaw_with_cut_primary_face(candidate, cut_state, reference_yaw=None):
    """Resolve yaw from a single edge, using the cut state to settle the 180-degree ambiguity.

    Args:
        candidate: Dict with ``face_type`` and ``points_3d`` for the single visible edge.
        cut_state: Must be ``CUT_STATE_IN`` or ``CUT_STATE_OUT``; anything else yields NaN.
        reference_yaw: Optional heading. When finite it takes precedence and the pi-equivalent yaw
            closest to it is returned.

    Returns:
        Yaw in radians, or NaN when the candidate or cut state cannot be used.
    """
    nan = float("nan")
    if candidate is None or cut_state not in (CUT_STATE_IN, CUT_STATE_OUT):
        return nan
    face_type = int(candidate["face_type"])

    def _has_reference():
        return reference_yaw is not None and np.isfinite(reference_yaw)

    if face_type in (0, 1):
        yaw = edge_points_to_yaw(candidate["points_3d"], face_type)
        if _has_reference():
            return _align_yaw_to_reference_pi_periodic(yaw, reference_yaw)
        primary_face = get_pred_cut_primary_face(cut_state)
        if primary_face in (0, 1) and int(primary_face) != face_type:
            # The cut state names the opposite front/rear face: turn the heading around by pi.
            return float((float(yaw) + 2 * np.pi) % (2 * np.pi) - np.pi)
        return float(yaw)

    if face_type not in (2, 3):
        return nan

    # The fitted midpoint is the mean of the finite BEV samples of this edge.
    axis, midpoint = _fit_bev_edge_axis(candidate["points_3d"])
    if axis is None or midpoint is None:
        return nan

    both_headings = [
        float((np.arctan2(-direction[1], direction[0]) + np.pi) % (2 * np.pi) - np.pi)
        for direction in (axis, -axis)
    ]
    shortlist = both_headings
    primary_face = get_pred_cut_primary_face(cut_state)
    if primary_face in (0, 1):
        consistent = []
        for heading in both_headings:
            direction = np.array([np.cos(float(heading)), -np.sin(float(heading))], dtype=np.float64)
            along_heading = float(np.dot(direction, midpoint))
            wants_front = int(primary_face) == 0 and along_heading > 0.0
            wants_rear = int(primary_face) == 1 and along_heading < 0.0
            if wants_front or wants_rear:
                consistent.append(float(heading))
        # Fall back to both headings when neither agrees with the cut state.
        shortlist = consistent or both_headings

    yaw = float(shortlist[0])
    if _has_reference():
        return _align_yaw_to_reference_pi_periodic(yaw, reference_yaw)
    return yaw
def extract_face_regressed_size_priors_from_prediction(pred_41):
    """Collect the two regressed size values of every face from one denormalized prediction vector.

    For each face offset in ``FACE_OFFSETS_41`` the values at ``off + 3`` and ``off + 4`` are read.
    Faces 0/1 report them as ``height``/``width``, faces 2/3 as ``length``/``height``. Faces whose
    pair is incomplete or non-finite are left out.

    Returns:
        Dict mapping face index to a dict of absolute size values.
    """
    flat = np.asarray(pred_41, dtype=np.float32).reshape(-1)
    priors = {}
    for face_type, offset in enumerate(FACE_OFFSETS_41):
        pair = np.asarray(flat[offset + 3 : offset + 5], dtype=np.float32).reshape(-1)
        if pair.shape != (2,) or not np.isfinite(pair).all():
            continue
        labels = ("height", "width") if face_type in (0, 1) else ("length", "height")
        priors[int(face_type)] = {label: float(abs(value)) for label, value in zip(labels, pair)}
    return priors
def _select_edge_or_regressed_size(measured_size_m, regressed_size_m, min_fraction=0.85, max_fraction=1.35):
"""Use edge-measured size when it is geometrically sane, otherwise fall back to regression."""
regressed = float(abs(regressed_size_m))
if not np.isfinite(regressed) or regressed <= 1e-6:
return None, None
measured = None if measured_size_m is None else float(abs(measured_size_m))
if measured is None or not np.isfinite(measured) or measured <= 1e-6:
return regressed, "regressed"
fraction = measured / regressed
if fraction < float(min_fraction) or fraction > float(max_fraction):
return regressed, "regressed"
return measured, "edge"
def reconstruct_edge_based_box_from_selection(edge_selection, box_center_y_m, regressed_dims, face_regressed_dims_by_type=None):
    """Reconstruct a full 3D box from one or two selected visible-face bottom edges.

    Two-face mode:
    - side edge provides length and lateral anchor
    - front/rear edge provides width and longitudinal anchor

    One-face mode:
    - front/rear edge provides width and the visible-face longitudinal+lateral anchor
    - side edge provides length and the visible-face longitudinal+lateral anchor

    The selected edge geometry stays the anchor. Height and the missing dimensions in one-face mode
    come from the regressed branch. Yaw is always taken from ``edge_selection["yaw"]``.

    Args:
        edge_selection: Mapping read through ``.get`` for the keys ``yaw``, ``face_types``,
            ``edge_points_3d`` and ``face_is_partial``.
        box_center_y_m: Fallback center Y, used only when the edge Y values are empty or non-finite.
        regressed_dims: Three values read as (length, height, width).
        face_regressed_dims_by_type: Optional mapping ``face_type -> {"length"/"width": value}``
            consulted by the size prior (presumably the output of
            ``extract_face_regressed_size_priors_from_prediction`` - confirm against callers).

    Returns:
        Dict with ``center``, ``dims``, ``yaw``, ``corners_3d``, ``mode`` ("two-face", "front-rear"
        or "side"), the size values with their sources and the resolved face types, or None when the
        selection cannot be reconstructed.
    """
    if edge_selection is None:
        return None
    yaw = float(edge_selection.get("yaw", float("nan")))
    if not np.isfinite(yaw):
        return None
    dims_reg = np.asarray(regressed_dims, dtype=np.float32).reshape(-1)
    if dims_reg.shape != (3,) or not np.isfinite(dims_reg).all():
        return None
    reg_length = float(abs(dims_reg[0]))
    box_height = float(abs(dims_reg[1]))
    reg_width = float(abs(dims_reg[2]))
    if reg_length <= 1e-6 or box_height <= 1e-6 or reg_width <= 1e-6:
        return None
    face_types = tuple(int(face_type) for face_type in (edge_selection.get("face_types") or ()))
    edge_batches = _edge_batches_to_list(edge_selection.get("edge_points_3d"))
    if len(face_types) != len(edge_batches):
        return None
    # Missing partial flags default to False so every face has one.
    face_is_partial = tuple(bool(flag) for flag in (edge_selection.get("face_is_partial") or ()))
    if len(face_is_partial) < len(face_types):
        face_is_partial = face_is_partial + (False,) * (len(face_types) - len(face_is_partial))
    # Any malformed edge rejects the whole selection rather than being skipped.
    candidates = []
    for face_type, points_3d, is_partial in zip(face_types, edge_batches, face_is_partial):
        pts = np.asarray(points_3d, dtype=np.float32)
        if pts.ndim != 2 or pts.shape[0] < 2 or pts.shape[1] != 3 or not np.isfinite(pts).all():
            return None
        candidates.append({"face_type": int(face_type), "points_3d": pts, "is_partial": bool(is_partial)})
    # Box axes in camera coordinates; the y component is zero so both lie in the x/z plane.
    forward = np.array([np.cos(yaw), 0.0, -np.sin(yaw)], dtype=np.float64)
    right = np.array([np.sin(yaw), 0.0, np.cos(yaw)], dtype=np.float64)
    center_x = None
    center_z = None
    length_m = None
    width_m = None
    length_source = None
    width_source = None
    mode = None
    resolved_face_types = list(face_types)
    resolved_longitudinal_face_type = None
    resolved_side_face_type = None
    face_regressed_dims_by_type = face_regressed_dims_by_type or {}

    def _face_size_prior(candidate, key, fallback, max_ratio=1.25):
        # Prefer the per-face regressed value, but only for non-partial faces and only when it
        # stays within ``max_ratio`` of the whole-box fallback.
        if candidate is None or bool(candidate.get("is_partial")):
            return float(fallback)
        prior = face_regressed_dims_by_type.get(int(candidate["face_type"]), {})
        value = prior.get(key)
        if value is None or not np.isfinite(float(value)) or float(value) <= 1e-6:
            return float(fallback)
        prior_value = float(value)
        fallback_value = float(abs(fallback))
        if fallback_value <= 1e-6:
            return prior_value
        ratio = max(prior_value / fallback_value, fallback_value / prior_value)
        if ratio > float(max_ratio):
            return fallback_value
        return prior_value

    role_resolution = _resolve_two_face_candidate_roles(candidates, yaw) if len(candidates) >= 2 else None
    if role_resolution is not None:
        # Two-face mode: both edges contribute a dimension and an anchor.
        longitudinal_info = {
            **role_resolution["longitudinal"],
            "forward_bev": role_resolution["forward_bev"],
            "right_bev": role_resolution["right_bev"],
        }
        side_info = {
            **role_resolution["side"],
            "forward_bev": role_resolution["forward_bev"],
            "right_bev": role_resolution["right_bev"],
        }
        longitudinal_candidate = longitudinal_info["candidate"]
        side_candidate = side_info["candidate"]
        # Partial (cut) edges do not span the whole face, so their length is never measured.
        side_length_m = None if bool(side_candidate.get("is_partial")) else _edge_segment_length_3d(side_candidate["points_3d"])
        width_from_long_m = (
            None if bool(longitudinal_candidate.get("is_partial")) else _edge_segment_length_3d(longitudinal_candidate["points_3d"])
        )
        length_m, length_source = _select_edge_or_regressed_size(
            side_length_m,
            _face_size_prior(side_candidate, "length", reg_length),
        )
        width_m, width_source = _select_edge_or_regressed_size(
            width_from_long_m,
            _face_size_prior(longitudinal_candidate, "width", reg_width),
        )
        if length_m is None or width_m is None:
            return None
        center_resolution = _resolve_two_face_center_from_geometry(longitudinal_info, side_info, length_m, width_m)
        if center_resolution is None:
            return None
        center_bev = np.asarray(center_resolution["center_bev"], dtype=np.float64)
        if center_bev.shape != (2,) or not np.isfinite(center_bev).all():
            return None
        center_x = float(center_bev[0])
        center_z = float(center_bev[1])
        resolved_longitudinal_face_type = int(center_resolution["longitudinal_face_type"])
        resolved_side_face_type = int(center_resolution["side_face_type"])
        # Overwrite the declared labels with the ones the geometry settled on.
        resolved_face_types[int(longitudinal_info["index"])] = resolved_longitudinal_face_type
        resolved_face_types[int(side_info["index"])] = resolved_side_face_type
        mode = "two-face"
    else:
        # One-face mode: pick the first candidate of each kind by its declared label.
        longitudinal_candidate = next((candidate for candidate in candidates if candidate["face_type"] in (0, 1)), None)
        side_candidate = next((candidate for candidate in candidates if candidate["face_type"] in (2, 3)), None)
    if mode == "two-face":
        pass
    elif longitudinal_candidate is not None:
        long_mid = np.mean(np.asarray(longitudinal_candidate["points_3d"], dtype=np.float64), axis=0)
        width_from_long_m = (
            None if bool(longitudinal_candidate.get("is_partial")) else _edge_segment_length_3d(longitudinal_candidate["points_3d"])
        )
        if not np.isfinite(long_mid).all():
            return None
        width_m, width_source = _select_edge_or_regressed_size(
            width_from_long_m,
            _face_size_prior(longitudinal_candidate, "width", reg_width),
        )
        if width_m is None:
            return None
        # Step half the regressed length from the face midpoint toward the box interior.
        longitudinal_sign = 1.0 if int(longitudinal_candidate["face_type"]) == 0 else -1.0
        center_from_longitudinal = long_mid - longitudinal_sign * forward * (float(reg_length) * 0.5)
        center_x = float(center_from_longitudinal[0])
        center_z = float(center_from_longitudinal[2])
        length_m = float(reg_length)
        width_source = width_source or "regressed"
        length_source = "regressed"
        resolved_longitudinal_face_type = int(longitudinal_candidate["face_type"])
        mode = "front-rear"
    elif side_candidate is not None:
        side_mid = np.mean(np.asarray(side_candidate["points_3d"], dtype=np.float64), axis=0)
        side_length_m = None if bool(side_candidate.get("is_partial")) else _edge_segment_length_3d(side_candidate["points_3d"])
        if not np.isfinite(side_mid).all():
            return None
        length_m, length_source = _select_edge_or_regressed_size(
            side_length_m,
            _face_size_prior(side_candidate, "length", reg_length),
        )
        if length_m is None:
            return None
        # Step half the regressed width from the face midpoint toward the box interior.
        side_sign = 1.0 if int(side_candidate["face_type"]) == 2 else -1.0
        center_from_side = side_mid - side_sign * right * (float(reg_width) * 0.5)
        center_x = float(center_from_side[0])
        center_z = float(center_from_side[2])
        width_m = float(reg_width)
        width_source = "regressed"
        resolved_side_face_type = int(side_candidate["face_type"])
        mode = "side"
    else:
        return None
    # The edges are bottom edges: the center sits half a box height below their mean Y value
    # (smaller Y; the module's rotation helper labels the Y axis as pointing down).
    all_y = np.concatenate([candidate["points_3d"][:, 1] for candidate in candidates], axis=0)
    if all_y.size == 0 or not np.isfinite(all_y).all():
        if box_center_y_m is None or not np.isfinite(float(box_center_y_m)):
            return None
        center_y = float(box_center_y_m)
    else:
        center_y = float(np.mean(all_y) - box_height * 0.5)
    center = np.array(
        [
            float(center_x),
            float(center_y),
            float(center_z),
        ],
        dtype=np.float32,
    )
    if not np.isfinite(center).all():
        return None
    dims = np.array([float(length_m), float(box_height), float(width_m)], dtype=np.float32)
    corners_3d = compute_3d_box_corners(center, dims, float(yaw), face_type=-1)
    return {
        "center": center,
        "dims": dims,
        "yaw": float(yaw),
        "corners_3d": corners_3d.astype(np.float32),
        "mode": mode,
        "side_length_m": float(length_m),
        "width_m": float(width_m),
        "length_source": length_source,
        "width_source": width_source,
        "face_types": tuple(int(face_type) for face_type in resolved_face_types),
        "longitudinal_face_type": resolved_longitudinal_face_type,
        "side_face_type": resolved_side_face_type,
    }
def reconstruct_two_face_box_from_edge_selection(edge_selection, box_height_m):
    """Run the generalized edge-based reconstruction and keep the result only if it is two-face.

    Args:
        edge_selection: Edge selection mapping forwarded unchanged.
        box_height_m: Box height; length and width are passed as 1.0 placeholders.

    Returns:
        The reconstructed box dict when its ``mode`` is ``"two-face"``, otherwise None.
    """
    # NOTE(review): with 1.0 m placeholders as the regressed length/width, the size gate in the
    # generalized routine keeps a measured edge length only when it lies within 0.85-1.35 m and
    # otherwise returns 1.0 - confirm this is the intended behavior for this legacy entry point.
    placeholder_dims = np.array([1.0, float(box_height_m), 1.0], dtype=np.float32)
    result = reconstruct_edge_based_box_from_selection(
        edge_selection,
        box_center_y_m=None,
        regressed_dims=placeholder_dims,
    )
    if result is not None and result.get("mode") == "two-face":
        return result
    return None
def classify_edge_yaw_prediction_bucket(face_types, is_valid):
    """Name the edge-yaw bucket a prediction falls into, judging by its visible face types.

    Args:
        face_types: Iterable of face indices (0/1 = front/rear, 2/3 = side); None counts as empty.
        is_valid: Whether the two-face combination is considered valid.

    Returns:
        ``"two-face"``, ``"side only"``, ``"front_rear_only"`` or None when no face is present. An
        invalid two-face combination is reported as ``"front_rear_only"``.
    """
    faces = {int(face_id) for face_id in (face_types or ())}
    sees_front_rear = not faces.isdisjoint((0, 1))
    sees_side = not faces.isdisjoint((2, 3))
    if bool(is_valid) and sees_front_rear and sees_side:
        return "two-face"
    if sees_front_rear:
        return "front_rear_only"
    return "side only" if sees_side else None
def _align_yaw_to_reference_pi_periodic(yaw, reference_yaw):
"""Choose the pi-equivalent yaw closest to a reference heading."""
if not np.isfinite(yaw) or not np.isfinite(reference_yaw):
return float(yaw)
base = float((float(yaw) + np.pi) % (2 * np.pi) - np.pi)
alt = float((float(yaw) + 2 * np.pi) % (2 * np.pi) - np.pi)
return min(
(base, alt),
key=lambda candidate: abs(float((candidate - float(reference_yaw) + np.pi) % (2 * np.pi) - np.pi)),
)
def _draw_edge_points(img, edge_points_2d=None, edge_color=(0, 255, 0), thickness=1):
"""Draw sampled bottom-edge points and the connecting polylines."""
if edge_points_2d is None:
return img
pts = np.asarray(edge_points_2d, dtype=np.float32)
if pts.size == 0 or np.any(np.isnan(pts)):
return img
if pts.ndim == 2:
pts = pts[None, ...]
if pts.ndim != 3 or pts.shape[1] == 0:
return img
radius = max(1, thickness + 1)
for poly in pts:
pts_i = np.round(poly).astype(np.int32)
cv2.polylines(img, [pts_i], isClosed=False, color=edge_color, thickness=thickness, lineType=cv2.LINE_AA)
for pt in pts_i:
cv2.circle(img, tuple(pt), radius, edge_color, -1, cv2.LINE_AA)
return img
def decode_3d_target(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
):
    """Decode a single 42-dim GT label to 3D box corners for visualization.

    Fields read from ``target_42``: ``[2]`` whole-box depth, ``[3:6]`` dims, ``[6]`` yaw, ``[7]``/``[8]``
    whole-box center normalized by image width/height, plus one 8-value record per face starting at
    each offset in ``FACE_OFFSETS_42`` (``[2]`` depth, ``[4]``/``[5]`` normalized face center,
    ``[6]`` score, ``[7]`` visibility flag).

    Args:
        target_42: GT label vector.
        cls_id: Class ID selecting the face-based or the whole-box decoding path.
        calib: Calibration dict; ``depth_scale`` defaults to 1.0 when missing or when ``calib`` is falsy.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Class IDs decoded from their best visible face.
        complete_3d_classes: Class IDs decoded from the whole-box center.
        score_thr: Minimum face score for a face to count as visible.
        bbox_xyxy: Optional 2D box forwarded to the cut/partial edge decoder.

    Returns:
        Dict with ``corners_3d``, ``face_center_2d``, ``face_color``, ``visible_face_type``,
        ``visible_face_types``, ``edge_points_2d``, ``edge_points_3d`` and ``cls``, or None when the
        label cannot be decoded or the class belongs to neither class set.
    """
    t = target_42
    # A missing or non-positive whole-box depth marks a label without usable 3D information.
    if np.isnan(t[2]) or t[2] <= 0:
        return None
    depth_scale = calib.get("depth_scale", 1.0) if calib else 1.0
    dims = t[3:6]
    rot_y = t[6]
    if cls_id in face_3d_classes:
        # Collect every visible face and remember the highest-scoring one as the anchor.
        best_type, best_score, best_data = -1, -1.0, None
        visible_faces = []
        for ft, off in enumerate(FACE_OFFSETS_42):
            face = t[off : off + 8]
            is_vis, score = face[7], face[6]
            if is_vis != 1 or np.isnan(score) or score < score_thr:
                continue
            z_f = face[2]
            if np.isnan(z_f) or z_f <= 0:
                continue
            visible_faces.append(ft)
            if score > best_score:
                best_score, best_type, best_data = float(score), ft, face
        if best_type < 0:
            return None
        # Face center is stored normalized; convert to pixels and scale the depth.
        u = best_data[4] * img_w
        v = best_data[5] * img_h
        z_f = best_data[2] * depth_scale
        corners = reconstruct_3d_box_from_face((u, v), z_f, dims, rot_y, best_type, calib)
        if corners is None:
            return None
        edge_points_3d, edge_points_2d = collect_face_bottom_edges(corners, visible_faces, calib, num_samples=5)
        partial_edge = decode_cut_partial_side_edge_from_gt(
            target_42,
            cls_id,
            calib,
            img_w,
            img_h,
            face_3d_classes,
            complete_3d_classes,
            bbox_xyxy=bbox_xyxy,
            corners_3d=corners,
            score_thr=score_thr,
        )
        if partial_edge is not None:
            edge_points_3d, edge_points_2d = _append_edge_batch(edge_points_3d, edge_points_2d, partial_edge)
            # dict.fromkeys de-duplicates while keeping the original face order.
            visible_faces = list(dict.fromkeys([*visible_faces, partial_edge["face_type"]]))
        return {
            "corners_3d": corners,
            "face_center_2d": (u, v),
            "face_color": FACE_COLORS[best_type],
            "visible_face_type": best_type,
            "visible_face_types": tuple(visible_faces),
            "edge_points_2d": edge_points_2d,
            "edge_points_3d": edge_points_3d,
            "cls": cls_id,
        }
    if cls_id in complete_3d_classes:
        # Whole-box path: no face information, so all face/edge fields stay empty.
        u = t[7] * img_w
        v = t[8] * img_h
        z = t[2] * depth_scale
        corners = reconstruct_3d_box_from_whole((u, v), z, dims, rot_y, calib)
        if corners is None:
            return None
        return {
            "corners_3d": corners,
            "face_center_2d": None,
            "face_color": None,
            "visible_face_type": None,
            "visible_face_types": (),
            "edge_points_2d": None,
            "edge_points_3d": None,
            "cls": cls_id,
        }
    return None
def decode_3d_prediction(
    pred_41,
    anchor_xy,
    stride,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    cls_id,
    pred_edge_60=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
):
    """Decode a single 41-dim denormalized prediction to 3D box corners.

    Fields read from ``pred_41``: ``[24]`` whole-box depth, ``[25:27]`` whole-box center offset in
    grid cells relative to ``anchor_xy``, ``[27:30]`` whole-box dims. Yaw and per-face values are
    decoded by helper functions.

    Args:
        pred_41: Denormalized prediction vector.
        anchor_xy: Anchor position in grid coordinates.
        stride: Stride converting grid coordinates to pixels.
        calib: Calibration dict forwarded to the projection helpers.
        img_w: Image width in pixels.
        img_h: Image height in pixels (not used in this function).
        face_3d_classes: Class IDs decoded from a predicted face anchor.
        complete_3d_classes: Class IDs decoded from the whole-box center.
        cls_id: Class ID selecting the decoding path.
        pred_edge_60: Optional edge-branch prediction; when given, its decoded edges replace the
            edges derived from the box corners.
        score_thr: Minimum face score for a face to count as visible.
        bbox_xyxy: Optional 2D box forwarded to the cut-state helpers.

    Returns:
        Dict with ``corners_3d``, ``face_center_2d``, ``face_color``, ``visible_face_type``,
        ``visible_face_types``, ``edge_points_2d``, ``edge_points_3d`` and ``cls``, or None when the
        prediction cannot be decoded or the class belongs to neither class set.
    """
    p = pred_41
    rot_y = _decode_yaw_from_prediction(p)
    z_whole = p[24]
    uv_whole_offset = p[25:27]
    dims_whole = p[27:30]
    # Offsets are expressed in grid cells relative to the anchor; the stride converts to pixels.
    u_whole = (anchor_xy[0] + uv_whole_offset[0]) * stride
    v_whole = (anchor_xy[1] + uv_whole_offset[1]) * stride
    if cls_id in face_3d_classes:
        _, cut_side = _resolve_pred_cut_state_for_decode(p, bbox_xyxy=bbox_xyxy, img_w=img_w)
        visible_faces = select_pred_visible_faces_for_decode(p, score_thr=score_thr, bbox_xyxy=bbox_xyxy, img_w=img_w)
        anchor_face = select_best_score_pred_face_anchor(
            p,
            anchor_xy,
            stride,
            calib,
            visible_faces,
        )
        if anchor_face is None:
            return None
        anchor_face_type = int(anchor_face["face_type"])
        anchor_face_center_3d = np.asarray(anchor_face["center_3d"], dtype=np.float32)
        if anchor_face_center_3d.shape != (3,) or not np.isfinite(anchor_face_center_3d).all():
            return None
        # The box is anchored at the selected face center, using the whole-box dims and yaw.
        corners = compute_3d_box_corners(anchor_face_center_3d, dims_whole, rot_y, anchor_face_type)
        edge_points_3d, edge_points_2d = collect_face_bottom_edges(
            corners, [face_type for face_type, _ in visible_faces], calib, num_samples=5
        )
        if pred_edge_60 is not None:
            # Prefer edges decoded from the edge branch; faces whose edge cannot be decoded or
            # back-projected are skipped.
            pred_edge_points_2d, pred_edge_points_3d = [], []
            for face_type, _ in visible_faces:
                pred_edge = decode_visible_face_edge_from_prediction(pred_edge_60, face_type, anchor_xy, stride)
                if pred_edge is None:
                    continue
                points_3d = [
                    back_project_2d_to_3d(tuple(pt), depth, calib) for pt, depth in zip(pred_edge["points_2d"], pred_edge["depths"])
                ]
                if any(point is None for point in points_3d):
                    continue
                pred_edge_points_2d.append(pred_edge["points_2d"].astype(np.float32, copy=False))
                pred_edge_points_3d.append(np.asarray(points_3d, dtype=np.float32))
            # Corner-derived edges are kept when no edge-branch edge survived.
            if pred_edge_points_2d:
                edge_points_2d = _stack_edge_batches(pred_edge_points_2d)
                edge_points_3d = _stack_edge_batches(pred_edge_points_3d)
        partial_edge = decode_cut_partial_side_edge_from_prediction(
            p,
            pred_edge_60,
            anchor_xy,
            stride,
            img_w,
            cut_side=cut_side,
            corners_3d=corners,
        )
        if partial_edge is not None:
            partial_points_3d = [
                back_project_2d_to_3d(tuple(pt), depth, calib)
                for pt, depth in zip(partial_edge["points_2d"], partial_edge["depths"])
            ]
            if all(point is not None for point in partial_points_3d):
                partial_edge = {**partial_edge, "points_3d": np.asarray(partial_points_3d, dtype=np.float32)}
                visible_face_types = {face_type for face_type, _ in visible_faces}
                # Only add the partial edge when its face is not already covered by a visible face.
                if partial_edge["face_type"] not in visible_face_types:
                    edge_points_3d, edge_points_2d = _append_edge_batch(edge_points_3d, edge_points_2d, partial_edge)
                    visible_faces = [*visible_faces, (partial_edge["face_type"], 1.0)]
        return {
            "corners_3d": corners,
            "face_center_2d": tuple(np.asarray(anchor_face["center_2d"], dtype=np.float32).tolist()),
            "face_color": FACE_COLORS[anchor_face_type],
            "visible_face_type": anchor_face_type,
            "visible_face_types": tuple(face_type for face_type, _ in visible_faces),
            "edge_points_2d": edge_points_2d,
            "edge_points_3d": edge_points_3d,
            "cls": cls_id,
        }
    if cls_id in complete_3d_classes:
        # Whole-box path: no face information, so all face/edge fields stay empty.
        corners = reconstruct_3d_box_from_whole((u_whole, v_whole), z_whole, dims_whole, rot_y, calib)
        if corners is None:
            return None
        return {
            "corners_3d": corners,
            "face_center_2d": None,
            "face_color": None,
            "visible_face_type": None,
            "visible_face_types": (),
            "edge_points_2d": None,
            "edge_points_3d": None,
            "cls": cls_id,
        }
    return None
def draw_3d_box(
    img,
    corners_3d,
    calib,
    face_center_2d=None,
    face_color=None,
    edge_points_2d=None,
    edge_color=(0, 255, 0),
    thickness=1,
):
    """Project and draw a 3D box wireframe on an image.

    Args:
        img: Image drawn on in place.
        corners_3d: (8, 3) array of box corners; rows are reordered to [4..7, 0..3] before drawing.
        calib: Calibration dict. When it carries at least four ``distort_coeffs`` the box edges are
            sampled and drawn through the distortion-aware path.
        face_center_2d: Optional (u, v) pixel position of the anchor face center.
        face_color: Color of the face-center dot; the dot is drawn only when both it and
            ``face_center_2d`` are given.
        edge_points_2d: Optional bottom-edge samples, drawn as polylines with dots.
        edge_color: Color of the bottom-edge overlay.
        thickness: Line thickness for the wireframe and the edge overlay.

    Returns:
        ``img``. On the undistorted path nothing is drawn when any projected corner is NaN.
    """
    corners_3d = corners_3d[[4, 5, 6, 7, 0, 1, 2, 3]]
    color_front = (0, 0, 255)
    color_back = (255, 0, 0)
    color_side = (255, 255, 0)
    distort_coeffs = calib.get("distort_coeffs", []) if calib is not None else []
    if distort_coeffs is not None and len(distort_coeffs) >= 4:
        edge_points_2d_box = project_3d_box_edges_with_distortion(corners_3d, calib, samples_per_edge=15)
        plot_box3d_on_img_with_distortion(
            img, edge_points_2d_box, color_front=color_front, color_back=color_back, color_side=color_side, thickness=thickness
        )
    else:
        corners_2d = project_3d_to_2d(corners_3d, calib)
        if np.any(np.isnan(corners_2d)):
            return img
        plot_box3d_on_img(
            img, corners_2d, color_front=color_front, color_back=color_back, color_side=color_side, thickness=thickness
        )
    if face_center_2d is not None and face_color is not None:
        # int() raises on NaN/inf, so a non-finite face center is skipped instead of aborting the draw.
        center_xy = np.asarray(face_center_2d, dtype=np.float64).reshape(-1)
        if center_xy.size >= 2 and np.isfinite(center_xy[:2]).all():
            cv2.circle(img, (int(center_xy[0]), int(center_xy[1])), 2, face_color, -1, cv2.LINE_AA)
    _draw_edge_points(img, edge_points_2d=edge_points_2d, edge_color=edge_color, thickness=thickness)
    return img
def plot_3d_boxes_on_image(img_tensor, decoded_results, calib=None, label_text=None, scale_factor=2):
    """Draw decoded 3D boxes on an image tensor.

    Args:
        img_tensor: (3, H, W) or (N, 3, H, W) tensor normalized [0, 1] BGR. Only the first image of
            a batch is used.
        decoded_results: List of dicts from decode_3d_target/decode_3d_prediction. Entries that are
            None or lack ``corners_3d`` are skipped.
        calib: Dict with fx, fy, cx, cy. When None, a default pinhole calibration derived from the
            upscaled image size is used.
        label_text: Optional text overlay (e.g., "3D GT" or "3D Pred").
        scale_factor: Upscale factor for clearer visualization.

    Returns:
        (H*scale, W*scale, 3) RGB numpy image. An image is always returned, even when no box was drawn.
    """
    if img_tensor.ndim == 4:
        img_tensor = img_tensor[0]
    im = img_tensor.cpu().numpy().transpose(1, 2, 0)
    # Clip before the uint8 cast so values slightly outside [0, 1] saturate instead of wrapping around.
    im = np.ascontiguousarray(np.clip(im * 255, 0, 255), dtype=np.uint8)
    h, w = im.shape[:2]
    h_new, w_new = h * scale_factor, w * scale_factor
    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_LINEAR)
    # Scale calibration so projections land on the upscaled image.
    if calib is not None:
        calib_s = {
            "fx": calib["fx"] * scale_factor,
            "fy": calib["fy"] * scale_factor,
            "cx": calib["cx"] * scale_factor,
            "cy": calib["cy"] * scale_factor,
            "distort_coeffs": calib.get("distort_coeffs", []),
            "depth_scale": calib.get("depth_scale", 1.0),
        }
    else:
        calib_s = {"fx": w_new * 1.2, "fy": w_new * 1.2, "cx": w_new / 2, "cy": h_new / 2, "distort_coeffs": []}
    for d in decoded_results:
        if d is None or d.get("corners_3d") is None:
            continue
        # 2D overlays are stored in original-image pixels and must be upscaled as well.
        fc = d.get("face_center_2d")
        if fc is not None:
            fc = (fc[0] * scale_factor, fc[1] * scale_factor)
        edge_points_2d = d.get("edge_points_2d")
        if edge_points_2d is not None:
            edge_points_2d = np.asarray(edge_points_2d, dtype=np.float32) * scale_factor
        draw_3d_box(
            im,
            d["corners_3d"],
            calib_s,
            fc,
            d.get("face_color"),
            edge_points_2d=edge_points_2d,
            thickness=max(1, scale_factor),
        )
    if label_text:
        cv2.putText(im, label_text, (10, 50), cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 255, 255), 3, cv2.LINE_AA)
    return cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
def decode_3d_prediction_batch(preds_3d_sel, anchors, strides, cls_ids, calib, img_w, img_h,
                               face_3d_classes, complete_3d_classes):
    """Decode a batch of selected 3D predictions, one anchor at a time.

    Args:
        preds_3d_sel: (k, 41) numpy array — denormalized 3D predictions.
        anchors: (2, k) numpy array — anchor xy in grid coords.
        strides: (k,) numpy array — stride per anchor.
        cls_ids: (k,) numpy array — class IDs.
        calib: Dict with fx, fy, cx, cy.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Set of class IDs with face annotations.
        complete_3d_classes: Set of class IDs with whole-box 3D only.

    Returns:
        List with one decode_3d_prediction result per row of ``preds_3d_sel``, in input order.
    """
    return [
        decode_3d_prediction(
            pred_row,
            anchors[:, idx],
            float(strides[idx]),
            calib,
            img_w,
            img_h,
            face_3d_classes,
            complete_3d_classes,
            int(cls_ids[idx]),
        )
        for idx, pred_row in enumerate(preds_3d_sel)
    ]
def decode_pred_face_anchor(pred_41, anchor_xy, stride, calib, face_type):
    """Decode a single predicted face center so it can anchor a visualization.

    Returns a dict with ``face_type``, ``center_3d`` (float32, shape (3,)) and ``center_2d`` (float32
    pixel uv), or None when the face index, depth, uv offset or back-projected center is unusable.
    """
    if face_type not in range(4):
        return None

    base = FACE_OFFSETS_41[int(face_type)]
    depth = float(pred_41[base])
    offset_uv = np.asarray(pred_41[base + 1 : base + 3], dtype=np.float32)
    depth_ok = np.isfinite(depth) and depth > 0
    if not (depth_ok and np.isfinite(offset_uv).all()):
        return None

    # Grid-space anchor plus predicted offset, scaled by the stride, gives pixel coordinates.
    u_px = float((anchor_xy[0] + offset_uv[0]) * stride)
    v_px = float((anchor_xy[1] + offset_uv[1]) * stride)

    projected = back_project_2d_to_3d((u_px, v_px), depth, calib)
    if projected is None:
        return None
    center = np.asarray(projected, dtype=np.float32)
    if center.shape == (3,) and np.isfinite(center).all():
        return {
            "face_type": int(face_type),
            "center_3d": center,
            "center_2d": np.array([u_px, v_px], dtype=np.float32),
        }
    return None
def select_best_score_pred_face_anchor(
    pred_41,
    anchor_xy,
    stride,
    calib,
    visible_faces,
):
    """Decode the face anchor of the highest-scoring entry in ``visible_faces``.

    ``visible_faces`` is an iterable of ``(face_type, score)`` pairs; entries whose face index is
    outside 0-3 are ignored. Returns None when no usable entry remains.
    """
    if not visible_faces:
        return None
    scored = [
        (int(face_type), float(score))
        for face_type, score in visible_faces
        if int(face_type) in range(4)
    ]
    if not scored:
        return None
    # max() keeps the first entry on ties, matching iteration order of visible_faces.
    winner, _ = max(scored, key=lambda pair: pair[1])
    return decode_pred_face_anchor(pred_41, anchor_xy, stride, calib, winner)
def _decode_yaw_from_prediction(pred_41):
    """Decode whole-box yaw from a 41-dim denormalized prediction.

    Slots 30:34 hold the yaw-bin logits and 34:38 the per-bin sine residuals; the yaw is the arcsin of
    the winning bin's (clipped) residual plus that bin's offset from YAW_BIN_OFFSETS.
    """
    bin_logits = pred_41[30:34]
    sin_residuals = np.clip(pred_41[34:38], -1.0, 1.0)
    winner = int(np.argmax(bin_logits))
    residual_angle = np.arcsin(sin_residuals[winner])
    return residual_angle + YAW_BIN_OFFSETS[winner]
def decode_visible_face_yaw_from_prediction(pred_41, pred_edge_60, anchor_xy, stride, face_type, calib):
    """Decode the auxiliary yaw of one visible face from its sampled bottom-edge prediction.

    Returns NaN when there is no edge prediction, the face index is outside 0-3, or the decoded edge
    cannot be lifted to 3D points.
    """
    nan = float("nan")
    if pred_edge_60 is None or face_type not in range(4):
        return nan
    edge = decode_visible_face_edge_from_prediction(pred_edge_60, face_type, anchor_xy, stride)
    edge_3d = _decoded_edge_to_points_3d(edge, calib)
    return nan if edge_3d is None else edge_points_to_yaw(edge_3d, face_type)
def decode_visible_face_yaw_from_gt(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    face_type,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
):
    """Decode the yaw of one GT-visible face from its sampled bottom-edge geometry.

    Returns NaN when decode_visible_face_edge_from_gt yields no edge for the requested face.
    """
    edge = decode_visible_face_edge_from_gt(
        target_42,
        cls_id,
        calib,
        img_w,
        img_h,
        face_3d_classes,
        complete_3d_classes,
        face_type=face_type,
        score_thr=score_thr,
        bbox_xyxy=bbox_xyxy,
    )
    if edge is not None:
        return edge_points_to_yaw(edge["points_3d"], edge["face_type"])
    return float("nan")
def decode_edge_yaw_selection_from_prediction(
    pred_41,
    pred_edge_60,
    anchor_xy,
    stride,
    calib,
    score_thr=EDGE_YAW_VALID_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
    img_w=None,
    img_h=None,
    max_lateral_dist_m=None,
    cut_side_min_visible_length_ratio=EDGE_YAW_CUT_SIDE_MIN_VISIBLE_LENGTH_RATIO,
    max_faces=2,
):
    """Select the face-edge geometry used for prediction-time edge-yaw re-estimation.

    The selection intentionally uses a face-based primary face plus an optional strict two-face companion:
    - choose the first face exactly as face-based reconstruction would choose its visible-face anchor
    - then choose at most one companion face from the opposite face family using the stricter threshold
    - for cut states, the cut classification chooses the longitudinal face first
    - for true border-cut objects, prefer the decoded partial side edge over a full side edge

    Args:
        pred_41: 41-dim denormalized prediction for one anchor.
        pred_edge_60: Denormalized auxiliary edge prediction for the same anchor, or None.
        anchor_xy: Anchor point in grid coordinates.
        stride: Anchor stride.
        calib: Per-sample calibration.
        score_thr: Visibility-score threshold applied to companion-face candidates. The primary face
            always uses FACE_VISIBILITY_SCORE_THRESH.
        bbox_xyxy: Optional 2D box; its third element is used as the image width when ``img_w`` is
            missing, and it is forwarded to the visible-face and cut-state helpers.
        img_w: Optional image width in pixels.
        img_h: Optional image height in pixels (may stay None).
        max_lateral_dist_m: Optional upper bound; when given, the anchor face's lateral distance must be
            strictly below it for ``lateral_ok``.
        cut_side_min_visible_length_ratio: For cut objects, the partial side edge's 3D length divided by
            ``abs(pred_41[27])`` must exceed this ratio.
        max_faces: A companion face is only looked for while fewer than this many faces are selected.

    Returns:
        Dict with keys ``yaw``, ``face_types``, ``face_is_partial``, ``edge_points_2d``,
        ``edge_points_3d``, ``two_face_eligible``, ``lateral_distance_m``, ``lateral_ok``,
        ``cut_side_visible_length_m``, ``cut_side_visible_length_ratio``, ``cut_side_visible_ratio_ok``
        and ``is_valid``. ``is_valid`` is True only when one face from (0, 1) and one from (2, 3) were
        selected, the yaw is finite and ``lateral_ok`` holds.
    """
    # Template returned whenever no edge can be selected.
    empty = {
        "yaw": float("nan"),
        "face_types": (),
        "face_is_partial": (),
        "edge_points_2d": None,
        "edge_points_3d": None,
        "two_face_eligible": False,
        "lateral_distance_m": None,
        "lateral_ok": False if max_lateral_dist_m is not None else True,
        "cut_side_visible_length_m": None,
        "cut_side_visible_length_ratio": None,
        "cut_side_visible_ratio_ok": None,
        "is_valid": False,
    }
    if pred_edge_60 is None:
        return empty
    inferred_img_w = float(img_w) if img_w is not None else None
    inferred_img_h = float(img_h) if img_h is not None else None
    if inferred_img_w is None:
        # No explicit width: use the box's right edge, else twice the decoded whole-box u coordinate.
        if bbox_xyxy is not None:
            inferred_img_w = max(float(np.asarray(bbox_xyxy, dtype=np.float64)[2]), 1.0)
        else:
            inferred_img_w = max(float((anchor_xy[0] + pred_41[25]) * stride) * 2.0, 1.0)
    # Primary-face pool: scored with the face-based threshold, not the stricter score_thr.
    decode_visible_faces = list(
        select_pred_visible_faces_for_decode(
            pred_41,
            score_thr=FACE_VISIBILITY_SCORE_THRESH,
            bbox_xyxy=bbox_xyxy,
            img_w=inferred_img_w,
        )
    )
    anchor_face = select_best_score_pred_face_anchor(pred_41, anchor_xy, stride, calib, decode_visible_faces)
    lateral_distance_m = None if anchor_face is None else _prediction_lateral_distance_m_from_center(anchor_face.get("center_3d"))
    # Without a limit the lateral gate always passes; with a limit a missing distance fails it.
    lateral_ok = bool(
        max_lateral_dist_m is None or (lateral_distance_m is not None and lateral_distance_m < float(max_lateral_dist_m))
    )
    # Highest-scoring valid face index in the primary pool, or -1 when the pool has none.
    primary_candidate_face_type = max(
        ((int(face_type), float(score)) for face_type, score in decode_visible_faces if int(face_type) in range(4)),
        key=lambda item: item[1],
        default=(-1, float("-inf")),
    )[0]
    raw_cut_state = get_pred_cut_state(pred_41)
    primary_face = get_pred_cut_primary_face(raw_cut_state)
    # Companion pool: faces clearing the stricter score_thr.
    visible_faces = list(select_pred_visible_faces(pred_41, score_thr=score_thr))
    longitudinal_faces = {face_type for face_type, _ in visible_faces if face_type in (0, 1)}
    if primary_face in longitudinal_faces and len(longitudinal_faces) > 1:
        # Both faces 0 and 1 passed: keep only the one chosen by the cut classification.
        visible_faces = [(face_type, score) for face_type, score in visible_faces if face_type not in (0, 1) or face_type == primary_face]

    def _decode_face_candidate(face_type, score, require_in_image=True):
        # Decode one face edge into a candidate dict (2D + 3D points); None when it is unusable.
        if face_type not in range(4):
            return None
        decoded = decode_visible_face_edge_from_prediction(pred_edge_60, face_type, anchor_xy, stride)
        if decoded is None:
            return None
        if require_in_image:
            drawable = _decoded_edge_points_are_drawable(decoded["points_2d"], inferred_img_w, inferred_img_h)
        else:
            # The primary edge should follow face-based anchor selection even when one sample lands just
            # outside the image. Companions stay fully in-image so the strict two-face case remains stable.
            drawable = _decoded_edge_points_are_drawable(decoded["points_2d"])
        if not drawable:
            return None
        points_3d = _decoded_edge_to_points_3d(decoded, calib)
        if points_3d is None:
            return None
        return {
            "face_type": int(face_type),
            "score": float(score),
            "is_partial": False,
            "points_2d": np.asarray(decoded["points_2d"], dtype=np.float32),
            "points_3d": np.asarray(points_3d, dtype=np.float32),
        }

    # Companion candidates keyed by face index.
    face_candidates = {}
    for face_type, score in visible_faces:
        candidate = _decode_face_candidate(face_type, score)
        if candidate is not None:
            face_candidates[int(face_type)] = candidate
    primary_candidate = None
    if primary_candidate_face_type in range(4):
        primary_score = next(
            (float(score) for face_type, score in decode_visible_faces if int(face_type) == int(primary_candidate_face_type)),
            float("-inf"),
        )
        primary_candidate = _decode_face_candidate(
            int(primary_candidate_face_type),
            primary_score,
            require_in_image=False,
        )
    if primary_candidate is not None:
        # The primary face must not be offered again as its own companion.
        face_candidates.pop(int(primary_candidate_face_type), None)
    resolved_cut_state, cut_side = _resolve_pred_cut_state_for_decode(pred_41, bbox_xyxy=bbox_xyxy, img_w=inferred_img_w)
    partial_candidate = None
    cut_side_visible_length_m = None
    cut_side_visible_length_ratio = None
    cut_side_visible_ratio_ok = None
    if resolved_cut_state != CUT_STATE_NORMAL:
        cut_corners = _reconstruct_pred_corners_for_cut_edge(pred_41, anchor_xy, stride, calib, cut_state=resolved_cut_state)
        partial_edge = decode_cut_partial_side_edge_from_prediction(
            pred_41,
            pred_edge_60,
            anchor_xy,
            stride,
            img_w=inferred_img_w,
            cut_side=cut_side,
            corners_3d=cut_corners,
        )
        if partial_edge is not None and not _decoded_edge_points_are_drawable(
            partial_edge["points_2d"], inferred_img_w, inferred_img_h
        ):
            partial_edge = None
        partial_points_3d = _decoded_edge_to_points_3d(partial_edge, calib)
        cut_side_visible_length_m = _edge_segment_length_3d(partial_points_3d)
        # pred_41[27] is the first of the three predicted box dimensions; it is used here as the box length.
        box_length_m = float(abs(pred_41[27])) if np.isfinite(pred_41[27]) else None
        if cut_side_visible_length_m is not None and box_length_m is not None and box_length_m > 1e-6:
            cut_side_visible_length_ratio = float(cut_side_visible_length_m / box_length_m)
            cut_side_visible_ratio_ok = bool(cut_side_visible_length_ratio > float(cut_side_min_visible_length_ratio))
        else:
            cut_side_visible_ratio_ok = False
        if partial_edge is not None and partial_points_3d is not None:
            partial_face_type = int(partial_edge["face_type"])
            # Reuse the full-edge score of the same face when available, otherwise default to 1.0.
            partial_score = face_candidates.get(partial_face_type, {}).get("score", 1.0)
            partial_candidate = {
                "face_type": partial_face_type,
                "score": float(partial_score),
                "is_partial": True,
                "points_2d": np.asarray(partial_edge["points_2d"], dtype=np.float32),
                "points_3d": np.asarray(partial_points_3d, dtype=np.float32),
            }
            # The partial edge replaces the full edge of the same face in the companion pool.
            face_candidates.pop(partial_face_type, None)
    if resolved_cut_state != CUT_STATE_NORMAL and not cut_side_visible_ratio_ok:
        # Too little of the cut side is visible: do not use the partial edge.
        partial_candidate = None
    selected_candidates = []

    def _best_candidate(candidates):
        # Highest score wins; ties go to the lower face index.
        if not candidates:
            return None
        return max(candidates, key=lambda item: (float(item["score"]), -int(item["face_type"])))

    cut_expected_side_face = None
    if raw_cut_state != CUT_STATE_NORMAL:
        # cut_corners only exists when the resolved state is a cut; otherwise rebuild from the raw state.
        cut_corners_for_side = (
            cut_corners
            if resolved_cut_state != CUT_STATE_NORMAL and cut_corners is not None
            else _reconstruct_pred_corners_for_cut_edge(pred_41, anchor_xy, stride, calib, cut_state=raw_cut_state)
        )
        cut_expected_side_face = get_cut_object_side_face(raw_cut_state, corners_3d=cut_corners_for_side)
    if primary_candidate is not None:
        selected_candidates.append(primary_candidate)
    if len(selected_candidates) < int(max_faces):
        secondary_candidate = None
        if primary_candidate is not None and int(primary_candidate["face_type"]) in (0, 1):
            # Longitudinal primary: companion is a side face. Preference order is the partial cut edge,
            # then the cut-expected side face, then the best remaining side candidate.
            secondary_candidate = partial_candidate
            if secondary_candidate is None and cut_expected_side_face in (2, 3) and (resolved_cut_state == CUT_STATE_NORMAL or cut_side_visible_ratio_ok):
                secondary_candidate = face_candidates.pop(int(cut_expected_side_face), None)
            if secondary_candidate is None and (resolved_cut_state == CUT_STATE_NORMAL or cut_side_visible_ratio_ok):
                secondary_candidate = _best_candidate([candidate for candidate in face_candidates.values() if candidate["face_type"] in (2, 3)])
                if secondary_candidate is not None:
                    face_candidates.pop(int(secondary_candidate["face_type"]), None)
        elif primary_candidate is not None and int(primary_candidate["face_type"]) in (2, 3):
            # Side primary: companion is a longitudinal face, preferring the cut-classified one.
            longitudinal_candidate = None
            if primary_face is not None:
                longitudinal_candidate = face_candidates.pop(int(primary_face), None)
            if longitudinal_candidate is None:
                longitudinal_candidate = _best_candidate([candidate for candidate in face_candidates.values() if candidate["face_type"] in (0, 1)])
                if longitudinal_candidate is not None:
                    face_candidates.pop(int(longitudinal_candidate["face_type"]), None)
            secondary_candidate = longitudinal_candidate
        if secondary_candidate is not None:
            selected_candidates.append(secondary_candidate)
    if not selected_candidates:
        # Nothing selected: still report the cut-side measurements gathered above.
        return {
            **empty,
            "cut_side_visible_length_m": cut_side_visible_length_m,
            "cut_side_visible_length_ratio": cut_side_visible_length_ratio,
            "cut_side_visible_ratio_ok": cut_side_visible_ratio_ok,
        }
    edge_points_3d = _stack_edge_batches([candidate["points_3d"] for candidate in selected_candidates])
    edge_points_2d = _stack_edge_batches([candidate["points_2d"] for candidate in selected_candidates])
    face_types = tuple(int(candidate["face_type"]) for candidate in selected_candidates)
    face_is_partial = tuple(bool(candidate.get("is_partial", False)) for candidate in selected_candidates)
    if len(selected_candidates) >= 2:
        longitudinal_selected = next((candidate for candidate in selected_candidates if candidate["face_type"] in (0, 1)), None)
        side_selected = next((candidate for candidate in selected_candidates if candidate["face_type"] in (2, 3)), None)
        yaw = _estimate_two_edge_yaw_from_candidates(
            longitudinal_selected,
            side_selected,
            reference_yaw=_decode_yaw_from_prediction(pred_41),
        )
        if not np.isfinite(yaw):
            # Two-edge estimate failed: fall back to the score-weighted multi-face estimate.
            yaw = visible_face_edges_to_yaw(
                {candidate["face_type"]: candidate["points_3d"] for candidate in selected_candidates},
                face_scores={candidate["face_type"]: candidate["score"] for candidate in selected_candidates},
            )
    else:
        only_candidate = selected_candidates[0]
        if raw_cut_state in (CUT_STATE_IN, CUT_STATE_OUT):
            yaw = _estimate_single_edge_yaw_with_cut_primary_face(
                only_candidate,
                cut_state=raw_cut_state,
                reference_yaw=_decode_yaw_from_prediction(pred_41),
            )
            if not np.isfinite(yaw):
                yaw = edge_points_to_yaw(only_candidate["points_3d"], only_candidate["face_type"])
        else:
            yaw = edge_points_to_yaw(only_candidate["points_3d"], only_candidate["face_type"])
    has_longitudinal = any(candidate["face_type"] in (0, 1) for candidate in selected_candidates)
    has_side = any(candidate["face_type"] in (2, 3) for candidate in selected_candidates)
    two_face_eligible = len(selected_candidates) >= 2 and has_longitudinal and has_side
    is_valid = bool(two_face_eligible and np.isfinite(yaw) and lateral_ok)
    return {
        "yaw": float(yaw),
        "face_types": face_types,
        "face_is_partial": face_is_partial,
        "edge_points_2d": edge_points_2d,
        "edge_points_3d": edge_points_3d,
        "two_face_eligible": bool(two_face_eligible),
        "lateral_distance_m": lateral_distance_m,
        "lateral_ok": lateral_ok,
        "cut_side_visible_length_m": cut_side_visible_length_m,
        "cut_side_visible_length_ratio": cut_side_visible_length_ratio,
        "cut_side_visible_ratio_ok": cut_side_visible_ratio_ok,
        "is_valid": bool(is_valid),
    }
def decode_multi_visible_face_yaw_from_prediction(
    pred_41,
    pred_edge_60,
    anchor_xy,
    stride,
    calib,
    fallback_face_type=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
    img_w=None,
):
    """Decode visible-face yaw using the same direct two-edge logic as prediction-time edge-yaw selection.

    Resolution order:
    1. the two-face result of decode_edge_yaw_selection_from_prediction, when eligible and finite;
    2. the single-face yaw of ``fallback_face_type`` when it is a valid face index (0-3);
    3. the multi-face estimate over every decodable visible face.

    Args:
        pred_41: 41-dim denormalized prediction for one anchor.
        pred_edge_60: Denormalized auxiliary edge prediction for the same anchor, or None.
        anchor_xy: Anchor point in grid coordinates.
        stride: Anchor stride.
        calib: Per-sample calibration.
        fallback_face_type: Optional face index (0-3) used when no two-face yaw is available.
        score_thr: Visibility-score threshold forwarded to the face selection helpers.
        bbox_xyxy: Optional 2D box; its third element stands in for the image width when ``img_w`` is None.
        img_w: Optional image width in pixels.

    Returns:
        Yaw in radians, or NaN when no edge-based yaw can be decoded.
    """
    if pred_edge_60 is None:
        # Without the edge branch nothing can be decoded: the single-face fallback also returns NaN
        # for a missing edge prediction, so there is no need to call it.
        return float("nan")

    inferred_img_w = float(img_w) if img_w is not None else None
    if inferred_img_w is None:
        if bbox_xyxy is not None:
            inferred_img_w = max(float(np.asarray(bbox_xyxy, dtype=np.float64)[2]), 1.0)
        else:
            inferred_img_w = max(float((anchor_xy[0] + pred_41[25]) * stride) * 2.0, 1.0)

    selection = decode_edge_yaw_selection_from_prediction(
        pred_41,
        pred_edge_60,
        anchor_xy,
        stride,
        calib,
        score_thr=score_thr,
        bbox_xyxy=bbox_xyxy,
        img_w=inferred_img_w,
    )
    if selection.get("two_face_eligible") and np.isfinite(selection.get("yaw", float("nan"))):
        return float(selection["yaw"])

    # A valid fallback face always wins over the multi-face estimate, so check it before spending
    # time decoding every visible face edge.
    if fallback_face_type in range(4):
        return decode_visible_face_yaw_from_prediction(pred_41, pred_edge_60, anchor_xy, stride, fallback_face_type, calib)

    face_edges_3d, face_scores = {}, {}
    for face_type, score in select_pred_visible_faces_for_decode(
        pred_41, score_thr=score_thr, bbox_xyxy=bbox_xyxy, img_w=inferred_img_w
    ):
        decoded = decode_visible_face_edge_from_prediction(pred_edge_60, face_type, anchor_xy, stride)
        points_3d = _decoded_edge_to_points_3d(decoded, calib)
        if points_3d is None:
            continue
        face_edges_3d[face_type] = points_3d
        face_scores[face_type] = float(score)
    return visible_face_edges_to_yaw(face_edges_3d, face_scores=face_scores)
def decode_multi_visible_face_yaw_from_gt(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    fallback_face_type=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
    bbox_xyxy=None,
):
    """Decode visible-face yaw from GT edge geometry with the same direct two-edge logic.

    Edges of all GT-visible faces (plus the partial side edge of cut objects, if any) are collected.
    With at least two edges and a finite multi-face yaw, that yaw is returned; otherwise the
    single-face yaw of ``fallback_face_type`` (0-3) is used, and failing that the multi-face estimate
    over whatever edges were collected.
    """
    # Positional arguments shared by every GT edge decoder called below.
    gt_args = (target_42, cls_id, calib, img_w, img_h, face_3d_classes, complete_3d_classes)

    edges_by_face = {}
    scores_by_face = {}
    for face_idx, face_vec in select_gt_visible_faces(target_42, score_thr=score_thr):
        edge = decode_visible_face_edge_from_gt(
            *gt_args,
            face_type=face_idx,
            score_thr=score_thr,
            bbox_xyxy=bbox_xyxy,
        )
        if edge is not None:
            key = edge["face_type"]
            edges_by_face[key] = edge["points_3d"]
            scores_by_face[key] = float(face_vec[6])

    cut_edge = decode_cut_partial_side_edge_from_gt(
        *gt_args,
        bbox_xyxy=bbox_xyxy,
        score_thr=score_thr,
    )
    if cut_edge is not None:
        # The partial edge overrides any full edge of the same face and carries a score of at least 1.0.
        key = cut_edge["face_type"]
        edges_by_face[key] = cut_edge["points_3d"]
        scores_by_face[key] = max(scores_by_face.get(key, 0.0), 1.0)

    if len(edges_by_face) >= 2:
        multi_yaw = visible_face_edges_to_yaw(edges_by_face, face_scores=scores_by_face)
        if np.isfinite(multi_yaw):
            return multi_yaw

    if fallback_face_type in range(4):
        return decode_visible_face_yaw_from_gt(
            *gt_args,
            fallback_face_type,
            score_thr=score_thr,
            bbox_xyxy=bbox_xyxy,
        )
    return visible_face_edges_to_yaw(edges_by_face, face_scores=scores_by_face)
def _back_project_metric_point(u, v, z, calib):
"""Back-project a metric point to 3D center coordinates."""
if calib is not None and z > 0:
center_3d = back_project_2d_to_3d((u, v), z, calib)
if center_3d is None:
x3d, y3d = float("nan"), float("nan")
else:
x3d, y3d = center_3d[0], center_3d[1]
else:
x3d, y3d = float("nan"), float("nan")
return np.array([x3d, y3d, z], dtype=np.float32)
def select_gt_visible_faces(target_42, score_thr=FACE_VISIBILITY_SCORE_THRESH):
    """Return ``(face_type, face_slice)`` pairs for GT faces eligible for face-based metrics.

    A face qualifies when its visibility flag (slot 7) equals 1, its score (slot 6) is not NaN and
    reaches ``score_thr``, and its depth (slot 2) is a positive number.
    """

    def _is_usable(face):
        visible, score, depth = face[7], face[6], face[2]
        if visible != 1 or np.isnan(score) or score < score_thr:
            return False
        return not (np.isnan(depth) or depth <= 0)

    faces = ((idx, target_42[start : start + 8]) for idx, start in enumerate(FACE_OFFSETS_42))
    return [(idx, face) for idx, face in faces if _is_usable(face)]
def select_pred_visible_faces(pred_41, score_thr=FACE_VISIBILITY_SCORE_THRESH):
    """Return ``(face_type, score)`` pairs for predicted faces whose score reaches ``score_thr``.

    The score of each face is read from slot 5 of its FACE_OFFSETS_41 branch; NaN scores are dropped.
    """
    scored = ((idx, float(pred_41[start + 5])) for idx, start in enumerate(FACE_OFFSETS_41))
    return [(idx, score) for idx, score in scored if not (np.isnan(score) or score < score_thr)]
def is_gt_face_cut(target_42, face_type):
    """Return whether a GT face was invalidated by crop handling.

    A face counts as cut when its first six slots all equal -1 and its visibility flag (slot 7) is
    not positive. Face indices outside 0-3 are never cut.
    """
    if face_type in range(4):
        start = FACE_OFFSETS_42[face_type]
        face = target_42[start : start + 8]
        return np.all(face[:6] == -1) and face[7] <= 0
    return False
def is_gt_cut_object(target_42):
    """Return whether a GT face-based object is labeled as cut-in or cut-out.

    That is the case when faces 2 and 3 are both cut together with either face 1 or face 0.
    """
    face0_cut, face1_cut, face2_cut, face3_cut = (
        is_gt_face_cut(target_42, face_idx) for face_idx in range(4)
    )
    return (face1_cut and face2_cut and face3_cut) or (face0_cut and face2_cut and face3_cut)
def extract_3d_attrs_from_prediction(pred_41, anchor_xy, stride, calib, face_type=None, pred_edge_60=None):
    """Extract raw 3D attributes from a single 41-dim denormalized prediction.

    Args:
        pred_41: Denormalized prediction.
        anchor_xy: Anchor point in grid coordinates.
        stride: Anchor stride.
        calib: Per-sample calibration.
        face_type: Optional face index (0-3). When provided, decode depth/UV from the matching face branch.
            When None, the whole-box branch (slots 24:27) is decoded instead.
        pred_edge_60: Optional denormalized auxiliary edge prediction aligned to the same anchor.

    Returns:
        Dict with center, depth, dims, yaw, edge_yaw, uv, visible_face_type and face_center, or None if
        the requested face branch is invalid (``face_type`` given but not in 0-3).
    """
    # Reject bad face indices up front: a negative index would silently wrap around FACE_OFFSETS_41
    # and decode the wrong face, and an index >= 4 would raise IndexError.
    if face_type is not None and face_type not in range(4):
        return None

    p = pred_41
    rot_y = _decode_yaw_from_prediction(p)
    dims = p[27:30].astype(np.float32)
    if face_type is None:
        z = float(p[24])
        uv_offset = p[25:27]
        edge_yaw = float("nan")
    else:
        off = FACE_OFFSETS_41[int(face_type)]
        z = float(p[off])
        uv_offset = p[off + 1 : off + 3]
        edge_yaw = decode_multi_visible_face_yaw_from_prediction(
            p,
            pred_edge_60,
            anchor_xy,
            stride,
            calib,
            fallback_face_type=face_type,
        )

    # Grid-space anchor plus predicted offset, scaled by the stride, gives pixel coordinates.
    u = float((anchor_xy[0] + uv_offset[0]) * stride)
    v = float((anchor_xy[1] + uv_offset[1]) * stride)
    center = _back_project_metric_point(u, v, z, calib)
    return {
        "center": center,
        "depth": z,
        "dims": dims,
        "yaw": float(rot_y),
        "edge_yaw": float(edge_yaw),
        "uv": np.array([u, v], dtype=np.float32),
        "visible_face_type": None if face_type is None else int(face_type),
        # For face-based decoding the back-projected point is the face center itself.
        "face_center": None if face_type is None else center,
    }
def face_center_from_corners(corners_3d, face_type):
    """Return the mean of the four corners of one face, taken from an (8, 3) box corner array.

    Returns None when the corners are missing, are not a finite (8, 3) array, or ``face_type`` is not
    a key of FACE_CORNERS.
    """
    if corners_3d is None or face_type not in FACE_CORNERS:
        return None
    pts = np.asarray(corners_3d, dtype=np.float32)
    if pts.shape == (8, 3) and np.isfinite(pts).all():
        corner_ids = list(FACE_CORNERS[face_type])
        return pts[corner_ids].mean(axis=0)
    return None
def rebuild_box_corners_for_visualization(
    corners_3d,
    dims,
    yaw,
    visible_face_type=None,
    face_center_3d=None,
    box_center_3d=None,
):
    """Rebuild box corners for visualization while preserving the appropriate anchor.

    Anchor priority: the visible face center (given, or derived from ``corners_3d``) when
    ``visible_face_type`` is set; else ``box_center_3d`` when given; else the mean of ``corners_3d``.
    Returns None whenever dims, yaw or the chosen anchor is not finite or has the wrong shape.
    """

    def _finite_vec3(value):
        # float32 copy of value when it is a finite 3-vector, else None.
        if value is None:
            return None
        vec = np.asarray(value, dtype=np.float32)
        return vec if vec.shape == (3,) and np.isfinite(vec).all() else None

    size = np.asarray(dims, dtype=np.float32)
    if size.shape != (3,) or not np.isfinite(size).all() or not np.isfinite(float(yaw)):
        return None

    if visible_face_type is not None:
        face_idx = int(visible_face_type)
        anchor = face_center_3d if face_center_3d is not None else face_center_from_corners(corners_3d, face_idx)
        anchor = _finite_vec3(anchor)
        if anchor is None:
            return None
        return compute_3d_box_corners(anchor, size, float(yaw), face_type=face_idx)

    if box_center_3d is not None:
        anchor = _finite_vec3(box_center_3d)
        if anchor is None:
            return None
        return compute_3d_box_corners(anchor, size, float(yaw), face_type=-1)

    pts = np.asarray(corners_3d, dtype=np.float32)
    if pts.shape == (8, 3) and np.isfinite(pts).all():
        return compute_3d_box_corners(pts.mean(axis=0), size, float(yaw), face_type=-1)
    return None
def extract_3d_attrs_from_gt(
    target_42,
    cls_id,
    calib,
    img_w,
    img_h,
    face_3d_classes,
    complete_3d_classes,
    face_type=None,
    score_thr=FACE_VISIBILITY_SCORE_THRESH,
):
    """Extract raw 3D attributes from a single 42-dim GT label.

    Args:
        target_42: GT 42-dim label.
        cls_id: Integer class ID.
        calib: Per-sample calibration.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        face_3d_classes: Class IDs that use face annotations.
        complete_3d_classes: Class IDs with whole-box-only 3D labels.
        face_type: Optional face index (0-3). When provided, decode only that GT-visible face.
        score_thr: Minimum visible-face score used to treat a GT face as valid.

    Returns:
        Dict with center, depth, dims, yaw, edge_yaw, uv, visible_face_type and face_center, or None if
        the requested representation is invalid.
    """
    t = target_42
    box_depth = t[2]
    if np.isnan(box_depth) or box_depth <= 0:
        return None
    if not (cls_id in face_3d_classes or cls_id in complete_3d_classes):
        return None

    depth_scale = calib.get("depth_scale", 1.0) if calib else 1.0
    dims = t[3:6].astype(np.float32)
    rot_y = float(t[6])

    if face_type is None:
        # Whole-box branch: depth and normalized uv come from the box slots.
        raw_depth, u_norm, v_norm = box_depth, t[7], t[8]
        edge_yaw = float("nan")
    else:
        if cls_id not in face_3d_classes or face_type not in range(4):
            return None
        start = FACE_OFFSETS_42[face_type]
        face = t[start : start + 8]
        visible, score = face[7], face[6]
        if visible != 1 or np.isnan(score) or score < score_thr:
            return None
        if np.isnan(face[2]) or face[2] <= 0:
            return None
        raw_depth, u_norm, v_norm = face[2], face[4], face[5]
        edge_yaw = decode_multi_visible_face_yaw_from_gt(
            t,
            cls_id,
            calib,
            img_w,
            img_h,
            face_3d_classes,
            complete_3d_classes,
            fallback_face_type=face_type,
            score_thr=score_thr,
        )

    z = float(raw_depth * depth_scale)
    u = float(u_norm * img_w)
    v = float(v_norm * img_h)
    center = _back_project_metric_point(u, v, z, calib)
    is_face = face_type is not None
    return {
        "center": center,
        "depth": z,
        "dims": dims,
        "yaw": rot_y,
        "edge_yaw": float(edge_yaw),
        "uv": np.array([u, v], dtype=np.float32),
        "visible_face_type": int(face_type) if is_face else None,
        "face_center": center if is_face else None,
    }
# ---- Bird's Eye View (BEV) visualization ----
def draw_bev_blank(max_range=200, lateral_range=50, ppm=10):
    """Create blank BEV canvas with distance grid.

    Args:
        max_range: Forward range in meters (integer).
        lateral_range: Lateral range in meters (±lateral_range, integer).
        ppm: Canvas resolution in pixels per meter. Defaults to 10, the previously hard-coded value.

    Returns:
        (bev_img, pixels_per_meter, ego_center_x, ego_center_y) tuple.

    Raises:
        ValueError: If ``ppm`` is smaller than 1.
    """
    ppm = int(ppm)  # pixel coordinates handed to OpenCV must be integers
    if ppm < 1:
        raise ValueError(f"ppm must be >= 1, got {ppm}")
    w = lateral_range * 2 * ppm
    h = max_range * ppm
    bev = np.full((h, w, 3), 40, dtype=np.uint8)  # dark gray background
    ego_cx = w // 2
    ego_cy = h  # bottom center

    # Horizontal grid lines every 20 m, labeled with their distance.
    for dist in range(0, max_range + 1, 20):
        y = ego_cy - dist * ppm
        if 0 <= y < h:
            cv2.line(bev, (0, y), (w, y), (80, 80, 80), 1)
            cv2.putText(bev, f"{dist}m", (5, y - 3), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (150, 150, 150), 1)
    # Vertical grid lines every 10 m of lateral offset.
    for lateral in range(-lateral_range, lateral_range + 1, 10):
        x = ego_cx + lateral * ppm
        if 0 <= x < w:
            cv2.line(bev, (x, 0), (x, h), (80, 80, 80), 1)

    # Ego vehicle marker
    cv2.rectangle(bev, (ego_cx - 8, ego_cy - 20), (ego_cx + 8, ego_cy), (255, 200, 0), -1)
    return bev, ppm, ego_cx, ego_cy
def draw_bev_object(bev_img, center_3d, dims, rot_y, ppm, ego_cx, ego_cy, is_pred=True):
    """Draw a single object on BEV image.

    Objects with a non-finite center, non-positive depth, non-finite length/width/yaw, or a center
    that falls outside the canvas are skipped silently.

    Args:
        bev_img: BEV canvas image (modified in place).
        center_3d: (x, y, z) in camera coordinates (x=right, z=forward).
        dims: (l, h, w) dimensions.
        rot_y: Rotation angle in radians.
        ppm: Pixels per meter.
        ego_cx: Ego center x in pixels.
        ego_cy: Ego center y in pixels.
        is_pred: True for predictions (red), False for GT (green).
    """
    x, _, z = center_3d
    length, _, width = dims
    if not (np.isfinite(x) and np.isfinite(z) and z > 0):
        return
    # int() raises on NaN/inf, so unusable dims or yaw must be skipped just like an unusable center.
    if not (np.isfinite(length) and np.isfinite(width) and np.isfinite(rot_y)):
        return

    # Camera coords: x=right, z=forward → BEV: right=+x, up=+z
    bev_x = int(ego_cx + x * ppm)
    bev_y = int(ego_cy - z * ppm)
    if not (0 <= bev_x < bev_img.shape[1] and 0 <= bev_y < bev_img.shape[0]):
        return

    color = (0, 0, 255) if is_pred else (0, 200, 0)  # Red for pred, green for GT

    # Draw rotated rectangle
    rect = ((bev_x, bev_y), (int(width * ppm), int(length * ppm)), -np.degrees(rot_y))
    box_pts = cv2.boxPoints(rect).astype(np.intp)
    cv2.drawContours(bev_img, [box_pts], 0, color, 2)

    # Arrow showing forward direction
    dx = int(length * 0.5 * ppm * np.sin(rot_y))
    dy = int(-length * 0.5 * ppm * np.cos(rot_y))
    cv2.arrowedLine(bev_img, (bev_x, bev_y), (bev_x + dx, bev_y + dy), color, 1, tipLength=0.3)
def create_bev_image(gt_3d_attrs_list, pred_3d_attrs_list, max_range=200, lateral_range=50):
    """Render a BEV image showing GT objects in green and predictions in red.

    Args:
        gt_3d_attrs_list: List of dicts with center, dims, yaw (from extract_3d_attrs_from_gt).
        pred_3d_attrs_list: List of dicts with center, dims, yaw (from extract_3d_attrs_from_prediction).
        max_range: Forward range in meters.
        lateral_range: Lateral range in meters.

    Returns:
        RGB numpy image (H, W, 3).
    """
    canvas, ppm, ego_cx, ego_cy = draw_bev_blank(max_range, lateral_range)

    # GT goes first so that predictions are painted on top of it; None entries are skipped.
    layers = ((gt_3d_attrs_list, False), (pred_3d_attrs_list, True))
    for attrs_list, is_pred in layers:
        for attrs in attrs_list:
            if attrs is None:
                continue
            draw_bev_object(
                canvas, attrs["center"], attrs["dims"], attrs["yaw"], ppm, ego_cx, ego_cy, is_pred=is_pred
            )

    # Legend in the top-left corner.
    for text, baseline_y, color in (("GT", 20, (0, 200, 0)), ("Pred", 45, (0, 0, 255))):
        cv2.putText(canvas, text, (10, baseline_y), cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)
    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)