# yolov26_3d/ultralytics/utils/metrics_3d.py

# Ultralytics AGPL-3.0 License - https://ultralytics.com/license

"""3D detection metrics for monocular 3D object detection.

Ported from yolov5-3d/utils/metrics.py. Provides depth error, orientation error,
3D center error, UV error, and grouped aggregation for matched prediction-GT pairs.
"""

import math

import numpy as np


def compute_depth_error(pred_depth, gt_depth, eps=1e-7):
    """Summarize the depth discrepancy between predictions and ground truth.

    Pairs are ignored when either value is non-finite or when the ground-truth
    depth is not strictly positive.

    Args:
        pred_depth: Predicted depth values (N,).
        gt_depth: Ground truth depth values (N,).
        eps: Small constant added to the ground-truth depth in the relative
            error denominator.

    Returns:
        Dict with keys ``abs_error`` (mean absolute error), ``rel_error``
        (mean absolute error divided by GT depth) and ``rmse``. All three are
        0.0 when no pair is usable.
    """
    predicted = np.asarray(pred_depth, dtype=np.float64)
    target = np.asarray(gt_depth, dtype=np.float64)

    usable = np.isfinite(predicted) & np.isfinite(target) & (target > 0)
    if not usable.any():
        return {"abs_error": 0.0, "rel_error": 0.0, "rmse": 0.0}

    target_kept = target[usable]
    residual = predicted[usable] - target_kept
    magnitude = np.abs(residual)

    mean_abs = float(magnitude.mean())
    mean_rel = float((magnitude / (target_kept + eps)).mean())
    root_mean_sq = float(np.sqrt((residual**2).mean()))
    return {"abs_error": mean_abs, "rel_error": mean_rel, "rmse": root_mean_sq}


def compute_orientation_error(pred_yaw, gt_yaw):
    """Compute mean absolute orientation error in degrees with wrap-around handling.

    The angular difference is reduced modulo a full turn before being folded
    onto [0, pi], so yaw values that are not normalized to a single turn
    (e.g. raw network outputs such as ``4*pi + 0.1``) still yield the true
    smallest angle between the two headings instead of a negative value.

    Args:
        pred_yaw: Predicted yaw angles in radians (N,).
        gt_yaw: Ground truth yaw angles in radians (N,).

    Returns:
        Mean absolute orientation error in degrees, in [0, 180]. Returns 0.0
        when no pair has both values finite.
    """
    pred_yaw = np.asarray(pred_yaw, dtype=np.float64)
    gt_yaw = np.asarray(gt_yaw, dtype=np.float64)

    valid = np.isfinite(pred_yaw) & np.isfinite(gt_yaw)
    if not np.any(valid):
        return 0.0

    full_turn = 2 * math.pi
    # Reduce to [0, 2*pi) first: without this, differences larger than one
    # turn make `full_turn - diff` negative and corrupt the minimum below.
    diff = np.abs(pred_yaw[valid] - gt_yaw[valid]) % full_turn
    # Fold onto [0, pi]: going the other way around the circle may be shorter.
    diff = np.minimum(diff, full_turn - diff)
    return float(np.mean(np.degrees(diff)))


def count_valid_orientation_pairs(pred_yaw, gt_yaw):
    """Return how many pairs have a finite predicted yaw and a finite ground-truth yaw."""
    pred_finite = np.isfinite(np.asarray(pred_yaw, dtype=np.float64))
    gt_finite = np.isfinite(np.asarray(gt_yaw, dtype=np.float64))
    both_finite = pred_finite & gt_finite
    return int(both_finite.sum())


def compute_visible_orientation_metrics(pred_direct_yaw, pred_edge_yaw, gt_yaw):
    """Evaluate direct and edge yaw predictions against the ground-truth yaw.

    Returns:
        Dict holding the mean orientation error for each prediction type
        (``direct_orient_visible``, ``edge_orient_visible``) together with the
        number of finite pairs behind each error, stored under the
        underscore-prefixed ``*_matched`` keys.
    """
    target = np.asarray(gt_yaw, dtype=np.float64)
    direct = np.asarray(pred_direct_yaw, dtype=np.float64)
    edge = np.asarray(pred_edge_yaw, dtype=np.float64)

    errors = {
        "direct_orient_visible": compute_orientation_error(direct, target),
        "edge_orient_visible": compute_orientation_error(edge, target),
    }
    # Valid-pair counts accompany each error value.
    counts = {
        "_direct_orient_visible_matched": count_valid_orientation_pairs(direct, target),
        "_edge_orient_visible_matched": count_valid_orientation_pairs(edge, target),
    }
    return {**errors, **counts}


def compute_3d_center_error(pred_center, gt_center):
    """Average the Euclidean distance between predicted and GT 3D centers.

    Rows containing any non-finite coordinate, in either input, are skipped.

    Args:
        pred_center: Predicted 3D centers (N, 3).
        gt_center: Ground truth 3D centers (N, 3).

    Returns:
        Mean Euclidean distance over the usable rows, or 0.0 if there are none.
    """
    predicted = np.asarray(pred_center, dtype=np.float64)
    target = np.asarray(gt_center, dtype=np.float64)

    rows_ok = np.isfinite(predicted).all(axis=1) & np.isfinite(target).all(axis=1)
    if not rows_ok.any():
        return 0.0

    offsets = predicted[rows_ok] - target[rows_ok]
    distances = np.linalg.norm(offsets, axis=1)
    return float(distances.mean())


def compute_size_error(pred_dims, gt_dims):
    """Average the absolute per-dimension difference between predicted and GT sizes.

    Rows with a non-finite entry in either input are excluded; the mean is then
    taken over every remaining element (all rows and all three dimensions).

    Args:
        pred_dims: Predicted dimensions (N, 3) - [l, h, w].
        gt_dims: Ground truth dimensions (N, 3) - [l, h, w].

    Returns:
        Mean absolute size error, or 0.0 when no row is usable.
    """
    predicted = np.asarray(pred_dims, dtype=np.float64)
    target = np.asarray(gt_dims, dtype=np.float64)

    finite_pred_rows = np.isfinite(predicted).all(axis=1)
    finite_gt_rows = np.isfinite(target).all(axis=1)
    keep = finite_pred_rows & finite_gt_rows
    if not keep.any():
        return 0.0

    deviation = np.abs(predicted[keep] - target[keep])
    return float(deviation.mean())


def compute_uv_error(pred_uv, gt_uv):
    """Return the mean absolute per-coordinate error between predicted and GT UV points.

    The U and V absolute errors are averaged together rather than combined into
    a Euclidean point distance, matching the training-time UV logging. Rows
    with any non-finite value in either input are dropped; 0.0 is returned if
    no row survives.
    """
    predicted = np.asarray(pred_uv, dtype=np.float64)
    target = np.asarray(gt_uv, dtype=np.float64)

    usable_rows = np.isfinite(predicted).all(axis=1) & np.isfinite(target).all(axis=1)
    if not usable_rows.any():
        return 0.0

    per_coord = np.abs(predicted[usable_rows] - target[usable_rows])
    return float(per_coord.mean())


def empty_3d_metrics(include_orient=True, include_size=True, include_uv=True, include_visible_orient=False):
    """Build the zero-valued 3D metrics dict logged when there are no matches.

    The depth, center and ``matched`` entries are always present; each
    ``include_*`` flag adds its corresponding zero-valued metric key(s).
    """
    defaults = dict.fromkeys(("depth_abs", "depth_rel", "depth_rmse", "center"), 0.0)
    defaults["matched"] = 0

    # Optional metrics, listed in the order they are inserted into the result.
    optional_groups = (
        (include_uv, ("uv",)),
        (include_orient, ("orient",)),
        (include_size, ("size",)),
        (include_visible_orient, ("direct_orient_visible", "edge_orient_visible")),
    )
    for enabled, names in optional_groups:
        if enabled:
            for name in names:
                defaults[name] = 0.0
    return defaults


def aggregate_3d_metric_groups(stats_by_group):
    """Combine per-entry 3D metrics of each group into one weighted average per metric.

    Args:
        stats_by_group: Mapping of group name to a list of metric dicts. Each
            dict carries a ``matched`` count and may carry underscore-prefixed
            weight counts (``_pos_matched``, ``_direct_orient_visible_matched``,
            ``_edge_orient_visible_matched``).

    Returns:
        Mapping of group name to an aggregated metric dict. Metric names are
        taken from the group's first entry (underscore-prefixed keys are
        dropped), values are rounded to 5 decimals, and ``matched`` is the sum
        of the entries' ``matched`` counts. A group with no entries gets the
        default empty metrics.
    """
    pos_weighted = {"depth_abs", "depth_rel", "depth_rmse", "center", "uv"}
    visible_weight_keys = {
        "direct_orient_visible": "_direct_orient_visible_matched",
        "edge_orient_visible": "_edge_orient_visible_matched",
    }

    def pick_weight_key(metric_name):
        # Depth/center/UV use the positive-match count, visible orientation
        # metrics use their own valid-pair count, everything else `matched`.
        if metric_name in pos_weighted:
            return "_pos_matched"
        return visible_weight_keys.get(metric_name, "matched")

    def weight_of(entry, weight_key):
        # Entries lacking the dedicated weight fall back to their `matched` count.
        return entry.get(weight_key, entry.get("matched", 0))

    results = {}
    for group_name, group_entries in stats_by_group.items():
        if not group_entries:
            results[group_name] = empty_3d_metrics(
                include_orient=group_name == "whole",
                include_size=group_name == "whole",
                include_visible_orient=group_name == "face",
            )
            continue

        metric_names = [
            name for name in group_entries[0] if name != "matched" and not name.startswith("_")
        ]
        summary = dict.fromkeys(metric_names, 0.0)
        summary["matched"] = 0

        matched_sum = sum(item["matched"] for item in group_entries)
        pos_sum = sum(item.get("_pos_matched", item["matched"]) for item in group_entries)
        if matched_sum <= 0 and pos_sum <= 0:
            # Nothing matched anywhere in this group: keep the all-zero summary.
            results[group_name] = summary
            continue

        for name in metric_names:
            weight_key = pick_weight_key(name)
            weight_sum = sum(weight_of(item, weight_key) for item in group_entries)
            if weight_sum <= 0:
                # Visible-orientation metrics with no valid pairs become NaN; others 0.0.
                summary[name] = float("nan") if name in visible_weight_keys else 0.0
                continue
            numerator = sum(
                item[name] * weight_of(item, weight_key)
                for item in group_entries
                if weight_of(item, weight_key) > 0
            )
            summary[name] = round(numerator / weight_sum, 5)

        summary["matched"] = matched_sum
        results[group_name] = summary
    return results


def compute_3d_metrics_for_matched(
    pred_3d_attrs,
    gt_3d_attrs,
    include_orient=True,
    include_size=True,
    include_uv=False,
    include_visible_orient=False,
):
    """Evaluate 3D metrics over prediction-GT pairs that are already matched.

    Args:
        pred_3d_attrs: Dict with keys:
            - center: (N, 3) predicted 3D centers [x, y, z]
            - depth: (N,) predicted z3d
            - yaw: (N,) predicted rotation_y in radians
            - edge_yaw: (N,) predicted visible-face yaw in radians (optional)
            - dims: (N, 3) predicted [l, h, w]
            - uv: (N, 2) predicted [u, v] in pixels (optional)
        gt_3d_attrs: Dict with same keys for ground truth.
        include_orient: Whether to compute orientation error.
        include_size: Whether to compute size error.
        include_uv: Whether to compute UV pixel error.
        include_visible_orient: Whether to compute visible-face direct and edge orientation errors.

    Returns:
        Dict with the requested metrics plus ``matched``, the number of
        predicted depth values. When that number is zero the default empty
        metrics for the same flags are returned instead.
    """
    num_pairs = len(pred_3d_attrs.get("depth", []))
    if not num_pairs:
        return empty_3d_metrics(
            include_orient=include_orient,
            include_size=include_size,
            include_uv=include_uv,
            include_visible_orient=include_visible_orient,
        )

    depth_stats = compute_depth_error(pred_3d_attrs["depth"], gt_3d_attrs["depth"])
    result = {
        "depth_abs": depth_stats["abs_error"],
        "depth_rel": depth_stats["rel_error"],
        "depth_rmse": depth_stats["rmse"],
        "center": compute_3d_center_error(pred_3d_attrs["center"], gt_3d_attrs["center"]),
        "matched": num_pairs,
    }

    if include_uv:
        result["uv"] = compute_uv_error(pred_3d_attrs["uv"], gt_3d_attrs["uv"])
    if include_orient:
        result["orient"] = compute_orientation_error(pred_3d_attrs["yaw"], gt_3d_attrs["yaw"])
    if include_size:
        result["size"] = compute_size_error(pred_3d_attrs["dims"], gt_3d_attrs["dims"])
    if include_visible_orient:
        direct_yaw = pred_3d_attrs["yaw"]
        # With no edge yaw supplied, an all-NaN array stands in for it.
        edge_yaw = pred_3d_attrs.get("edge_yaw", np.full(num_pairs, np.nan, dtype=np.float64))
        visible = compute_visible_orientation_metrics(direct_yaw, edge_yaw, gt_3d_attrs["yaw"])
        result.update(visible)
    return result