# yolov26_3d/eval_tools/evaluator/parser.py

"""
Data parser for ground truth and detection results.
Supports both TXT and JSON formats.

TXT GT format:  normalized xywh bbox + 47-dim 3D labels (47-dim label format per CLAUDE.md)
JSON GT format: absolute pixel box2d, 3d_ori=[x3d,y3d,z3d,l,h,w,rot_y,xc,yc,...], 3d_front/back/left/right=[x3d,y3d,z3d,alpha,xc,yc,score,is_visible]

TXT Det format: class_name conf x1 y1 x2 y2 coord_sys x3d y3d z3d l h w rot_y face_type
JSON Det format: type/type_name, score, box2d (absolute pixels), xyzlhwyaw=[x3d,y3d,z3d,l,h,w,rot_y], face_cls
"""
import json
import numpy as np

from ..class_config import CLASS_NAMES, CLASS_NAME_TO_ID, CLASSES_3D, FACE_3D_CLASSES


def yaw_to_radians(yaw_value, coord_system):
    """Return the parsed yaw as a float expressed in radians.

    Args:
        yaw_value: yaw as a number or numeric string.
        coord_system: coordinate system name. For 'ego' the value is
            treated as degrees and converted; for any other value it is
            returned unchanged (after conversion to float).

    Returns:
        float, yaw for downstream evaluation.
    """
    angle = float(yaw_value)
    is_ego = coord_system == 'ego'
    return float(np.deg2rad(angle)) if is_ego else angle


class GroundTruthParser:
    """Parse ground truth annotation files (TXT lines or JSON objects).

    Every parsed annotation is a dict with at least the keys ``label``,
    ``bbox_2d`` ([x1, y1, x2, y2] in pixels), ``has_3d`` and ``3d_info``.
    JSON annotations additionally carry the raw ``id`` field.
    """

    # Class ID to name mapping — imported from eval_tools/class_config.py
    CLASS_NAMES = CLASS_NAMES

    # 3D classes — imported from eval_tools/class_config.py
    CLASSES_3D = CLASSES_3D
    VALID_COORD_SYSTEMS = {"camera", "ego"}

    def __init__(self, min_box_size=8, coord_system='camera'):
        """
        Initialize ground truth parser.

        Args:
            min_box_size: float, minimum bbox width or height in pixels.
                          Boxes smaller than this will be filtered out.
                          Default is 8. Should be calculated based on ROI config:
                          - ROI0 (1920->704): 8 * 1920 / 704 ≈ 21.8
                          - ROI1 (704->704): 8 * 704 / 704 = 8
            coord_system: str, 'camera' or 'ego'. For JSON GT, 'ego' reads
                          the ``*_ego`` fields (``3d_ori_ego``, ``3d_front_ego``, ...)
                          and converts yaw from degrees to radians; 'camera'
                          reads the plain fields and keeps yaw unchanged.

        Raises:
            ValueError: if coord_system is not one of VALID_COORD_SYSTEMS.
        """
        self.min_box_size = min_box_size
        if coord_system not in self.VALID_COORD_SYSTEMS:
            raise ValueError(f"Unsupported coord_system: {coord_system}")
        self.coord_system = coord_system

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_box2d(box2d):
        """Return (x1, y1, x2, y2) as floats, or None if box2d is unusable.

        Unusable means: missing/None, fewer than 4 elements, or elements
        that cannot be converted to float.
        """
        try:
            if len(box2d) < 4:
                return None
            return (float(box2d[0]), float(box2d[1]),
                    float(box2d[2]), float(box2d[3]))
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _has_valid_3d(values):
        """Return True if values has >= 7 items and x3d (values[0]) is not -1.

        None, non-sequences and an unparseable x3d all count as "no 3D".
        """
        try:
            return len(values) >= 7 and float(values[0]) != -1
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _json_sort_key(key):
        """Sort key for JSON object keys: numeric keys first (by value), then others.

        Returning a tuple keeps numeric and non-numeric keys comparable, so a
        dict that mixes "0", "1", ... with e.g. "meta" does not raise TypeError.
        """
        text = str(key)
        if text.isdigit():
            try:
                return (0, int(text), '')
            except ValueError:
                # isdigit() is True for some characters int() rejects (e.g. '²')
                pass
        return (1, 0, text)

    def _ori_key(self):
        """Return the JSON key holding the whole-object 3D annotation."""
        return '3d_ori_ego' if self.coord_system == 'ego' else '3d_ori'

    # ------------------------------------------------------------------
    # TXT format
    # ------------------------------------------------------------------
    def parse_line(self, line, img_width, img_height):
        """
        Parse a single line of ground truth annotation (TXT format).

        Args:
            line: str, annotation line (whitespace-separated numbers:
                  label, normalized cx, cy, w, h, then 3D values)
            img_width: int, image width
            img_height: int, image height

        Returns:
            None if the line has fewer than 6 values or the box is smaller
            than min_box_size; otherwise a dict with keys:
                - label: int
                - bbox_2d: [x1, y1, x2, y2] in pixel coordinates
                - has_3d: bool
                - 3d_info: dict or None

        Raises:
            ValueError: if a token on the line is not a valid float.
        """
        values = [float(x) for x in line.strip().split()]

        if len(values) < 6:
            return None

        label = int(values[0])

        # Parse 2D bbox (normalized center + width/height to pixel corners)
        x_center_px = values[1] * img_width
        y_center_px = values[2] * img_height
        w_px = values[3] * img_width
        h_px = values[4] * img_height

        x1 = x_center_px - w_px / 2
        y1 = y_center_px - h_px / 2
        x2 = x_center_px + w_px / 2
        y2 = y_center_px + h_px / 2

        # Filter out small objects based on configured minimum size
        if (x2 - x1) < self.min_box_size or (y2 - y1) < self.min_box_size:
            return None

        has_3d = self.is_3d_annotated(values)

        return {
            'label': label,
            'bbox_2d': [x1, y1, x2, y2],
            'has_3d': has_3d,
            '3d_info': self._parse_3d_info(values, label) if has_3d else None,
        }

    def is_3d_annotated(self, values):
        """Check if the annotation contains 3D information.

        A line counts as 3D-annotated when it has at least 18 values.
        Shorter lines (including the 6-value form whose last value is -1)
        are treated as 2D-only.
        """
        return len(values) >= 18

    def _parse_3d_info(self, values, label):
        """Parse 3D information from TXT annotation values.

        Values are used as-is (no yaw conversion is applied here).
        """
        info = {
            'center': [values[5], values[6], values[7]],  # x3d_ori, y3d_ori, z3d_ori
            'dimensions': [values[8], values[9], values[10]],  # l3d, h3d, w3d
            'rotation': values[11],  # rot_y
            'faces': None
        }

        # For face_3d_classes, parse face information.
        # NOTE(review): faces are only read when the line has exactly 50 values;
        # confirm this matches the label layout described in the module docstring.
        if label in FACE_3D_CLASSES and len(values) == 50:
            # Each face is 8 values: x3d, y3d, z3d, alpha, xc, yc, score, flag
            # NOTE(review): last field is called is_occ here but is_visible in
            # the module docstring — confirm which one is correct.
            info['faces'] = {
                'front': values[18:26],
                'back': values[26:34],
                'left': values[34:42],
                'right': values[42:50]
            }

        return info

    def get_class_name(self, label_id):
        """Get class name from label ID ("unknown" if the ID is not mapped)."""
        return self.CLASS_NAMES.get(label_id, "unknown")

    # ------------------------------------------------------------------
    # JSON format
    # ------------------------------------------------------------------
    def _should_filter_negative_id_gt(self, entry, label):
        """
        Filter JSON GT objects that should not participate in 3D-class evaluation.

        Rule:
        - Only applies to labels contained in CLASSES_3D
        - If GT carries an `id` field and id < 0, drop this GT entirely
        - A missing or non-integer id never causes filtering
        """
        if label not in self.CLASSES_3D:
            return False

        object_id = entry.get('id')
        if object_id is None:
            return False

        try:
            return int(object_id) < 0
        except (ValueError, TypeError):
            return False

    def parse_gt_json_entry(self, entry, img_width, img_height):
        """
        Parse a single entry from a GT JSON file.

        GT JSON entry format:
            {
                "type": "0",           # class id string
                "type_name": "vehicle",
                "roi_id": "1",
                "box2d": ["x1","y1","x2","y2"],   # absolute pixel coords
                "3d_ori": ["x3d","y3d","z3d","l","h","w","rot_y","xc","yc",...,"alpha","flag"],
                "3d_front": ["x3d","y3d","z3d","alpha","xc","yc","score","is_visible"],
                "3d_back":  [...],
                "3d_left":  [...],
                "3d_right": [...]
            }

        In 'ego' mode the ``*_ego`` variants of the 3D keys are read instead.

        Args:
            entry: dict, single GT JSON entry
            img_width: int, image width (unused for JSON, bbox already in pixels)
            img_height: int, image height (unused for JSON, bbox already in pixels)

        Returns:
            dict (label, bbox_2d, has_3d, 3d_info, id), or None when the entry
            has no usable type, is filtered by its negative id, has a missing
            or malformed box2d, or is smaller than min_box_size.

        Raises:
            ValueError: in 'ego' mode, when the entry has a valid camera-frame
                ``3d_ori`` but no ``3d_ori_ego`` key at all.
        """
        raw_type = entry.get('type')
        if raw_type is None or str(raw_type).strip().lower() in ('', 'none', 'null'):
            return None
        try:
            label = int(raw_type)
        except (ValueError, TypeError):
            return None

        if self._should_filter_negative_id_gt(entry, label):
            return None

        # box2d may be absent, null, too short or non-numeric: skip the entry
        corners = self._parse_box2d(entry.get('box2d'))
        if corners is None:
            return None
        x1, y1, x2, y2 = corners

        # Filter small objects
        if (x2 - x1) < self.min_box_size or (y2 - y1) < self.min_box_size:
            return None

        ori_key = self._ori_key()

        # Ego evaluation on camera-only GT would silently drop all 3D GT;
        # fail loudly instead when camera 3D exists but the ego key is absent.
        if (self.coord_system == 'ego'
                and ori_key not in entry
                and '3d_ori' in entry
                and self._has_valid_3d(entry['3d_ori'])):
            raise ValueError(
                "GT JSON is missing ego-coordinate fields (3d_ori_ego). "
                "Please regenerate ground truth with ego fields before running ego-coordinate evaluation."
            )

        # x3d == -1 (first element) indicates "no 3D annotation"
        has_3d = self._has_valid_3d(entry.get(ori_key))
        d3_info = self._parse_3d_info_from_json(entry, label) if has_3d else None

        return {
            'label': label,
            'bbox_2d': [x1, y1, x2, y2],
            'has_3d': has_3d,
            '3d_info': d3_info,
            'id': entry.get('id'),
        }

    def _parse_3d_info_from_json(self, entry, label):
        """Parse 3D information from a JSON GT entry.

        The caller is expected to have verified that the orientation field
        for the active coordinate system exists and has at least 7 values.
        """
        d3_ori = entry[self._ori_key()]
        info = {
            'center':     [float(d3_ori[0]), float(d3_ori[1]), float(d3_ori[2])],  # x3d, y3d, z3d
            'dimensions': [float(d3_ori[3]), float(d3_ori[4]), float(d3_ori[5])],  # l, h, w
            'rotation':    yaw_to_radians(d3_ori[6], self.coord_system),             # rot_y
            'faces': None,
            'coord_system': self.coord_system,
        }

        # Face information is only attached for labels in FACE_3D_CLASSES and
        # only when all four face keys are present in the entry.
        face_keys = {'front': '3d_front', 'back': '3d_back', 'left': '3d_left', 'right': '3d_right'}
        if self.coord_system == 'ego':
            face_keys = {name: f"{key}_ego" for name, key in face_keys.items()}
        if label in FACE_3D_CLASSES and all(k in entry for k in face_keys.values()):
            # Keep at most the first 8 values of each face (shorter lists are kept whole)
            info['faces'] = {
                face_name: [float(v) for v in entry[json_key][:8]]
                for face_name, json_key in face_keys.items()
            }

        return info

    def parse_gt_json_file(self, file_path, img_width, img_height):
        """
        Parse an entire GT JSON file.

        The JSON root is normally a dict keyed by object index ("0", "1", ...);
        a list root is also accepted. Numeric keys are processed in numeric
        order, followed by any non-numeric keys in string order.

        A missing file, invalid JSON, a null root or an unsupported root type
        yields an empty list (with a printed warning where applicable).

        Returns:
            list of parsed annotation dicts
        """
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"Warning: File not found: {file_path}")
            return []
        except json.JSONDecodeError as e:
            print(f"Warning: JSON decode error in {file_path}: {e}")
            return []

        if data is None:
            return []

        if isinstance(data, dict):
            items = sorted(data.items(), key=lambda item: self._json_sort_key(item[0]))
        elif isinstance(data, list):
            items = list(enumerate(data))
        else:
            print(f"Warning: unsupported GT JSON root type {type(data).__name__} in {file_path}")
            return []

        annotations = []
        for key, entry in items:
            if not isinstance(entry, dict):
                print(f"Warning: skipping non-dict GT entry at key={key!r} in {file_path}")
                continue
            # Checked here (in addition to parse_gt_json_entry) so the warning
            # can name the offending key and file.
            raw_type = entry.get('type')
            if raw_type is None or str(raw_type).strip().lower() in ('', 'none', 'null'):
                print(f"Warning: skipping entry with invalid type={raw_type!r} "
                      f"(key={key!r}) in {file_path}")
                continue
            parsed = self.parse_gt_json_entry(entry, img_width, img_height)
            if parsed is not None:
                annotations.append(parsed)
        return annotations

    def parse_file(self, file_path, img_width, img_height):
        """
        Parse entire annotation file. Dispatches to JSON or TXT parser based on extension.

        Args:
            file_path: str, path to annotation file (.txt or .json; the
                       extension check is case-insensitive)
            img_width: int, image width
            img_height: int, image height

        Returns:
            list of parsed annotations (empty if the file does not exist)
        """
        if str(file_path).lower().endswith('.json'):
            return self.parse_gt_json_file(file_path, img_width, img_height)

        annotations = []
        try:
            with open(file_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    parsed = self.parse_line(line, img_width, img_height)
                    if parsed is not None:
                        annotations.append(parsed)
        except FileNotFoundError:
            print(f"Warning: File not found: {file_path}")
            return []

        return annotations


class DetectionParser:
    """Parse detection result files (TXT lines or JSON objects).

    Every parsed detection is a dict with at least the keys ``label``,
    ``confidence``, ``bbox_2d`` ([x1, y1, x2, y2] in pixels) and ``3d_info``.
    JSON detections additionally carry ``id`` and ``roi_id``.
    """

    # Class name to ID mapping — imported from eval_tools/class_config.py
    CLASS_NAME_TO_ID = CLASS_NAME_TO_ID

    # 3D classes — imported from eval_tools/class_config.py
    CLASSES_3D = CLASSES_3D
    VALID_COORD_SYSTEMS = {"camera", "ego"}

    def __init__(self, min_box_size=0, coord_system='camera'):
        """
        Initialize detection parser.

        Args:
            min_box_size: float, minimum bbox width or height in pixels.
                          Detections smaller than this will be filtered out.
                          Should match the GT min_box_size to ensure
                          symmetric filtering. Default is 0 (no filtering).
            coord_system: str, 'camera' or 'ego'. For JSON detections, 'ego'
                          reads ``xyzlhwyaw_ego`` and converts yaw from degrees
                          to radians; 'camera' reads ``xyzlhwyaw`` unchanged.
                          TXT detections carrying 3D fields are rejected in
                          'ego' mode.

        Raises:
            ValueError: if coord_system is not one of VALID_COORD_SYSTEMS.
        """
        self.min_box_size = min_box_size
        if coord_system not in self.VALID_COORD_SYSTEMS:
            raise ValueError(f"Unsupported coord_system: {coord_system}")
        self.coord_system = coord_system

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _parse_box2d(box2d):
        """Return (x1, y1, x2, y2) as floats, or None if box2d is unusable.

        Unusable means: missing/None, fewer than 4 elements, or elements
        that cannot be converted to float.
        """
        try:
            if len(box2d) < 4:
                return None
            return (float(box2d[0]), float(box2d[1]),
                    float(box2d[2]), float(box2d[3]))
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _has_valid_3d(values):
        """Return True if values has >= 7 items and x3d (values[0]) is not -1.

        The sentinel is compared numerically so -1, -1.0, "-1" and "-1.0"
        are all recognised as "no 3D". None, non-sequences and an
        unparseable x3d also count as "no 3D".
        """
        try:
            return len(values) >= 7 and float(values[0]) != -1
        except (TypeError, ValueError):
            return False

    @staticmethod
    def _json_sort_key(key):
        """Sort key for JSON object keys: numeric keys first (by value), then others.

        Returning a tuple keeps numeric and non-numeric keys comparable, so a
        dict that mixes "0", "1", ... with e.g. "meta" does not raise TypeError.
        """
        text = str(key)
        if text.isdigit():
            try:
                return (0, int(text), '')
            except ValueError:
                # isdigit() is True for some characters int() rejects (e.g. '²')
                pass
        return (1, 0, text)

    def _filtered_by_size(self, x1, y1, x2, y2):
        """Return True if size filtering is enabled and the box is too small."""
        if self.min_box_size <= 0:
            return False
        return (x2 - x1) < self.min_box_size or (y2 - y1) < self.min_box_size

    # ------------------------------------------------------------------
    # TXT format
    # ------------------------------------------------------------------
    def parse_line(self, line):
        """
        Parse a single line of detection result (TXT format).

        Args:
            line: str, detection line

        Returns:
            None if the line has fewer than 6 fields or the box is filtered
            by min_box_size; otherwise a dict with keys:
                - label: int (-1 if the class name is not mapped)
                - confidence: float
                - bbox_2d: [x1, y1, x2, y2] in pixel coordinates
                - 3d_info: dict or None

        Raises:
            ValueError: if a numeric field cannot be converted to float, or
                if the line carries 3D fields for a 3D class while the parser
                is in 'ego' mode.
        """
        parts = line.strip().split()

        if len(parts) < 6:
            return None

        label = self.map_class_name(parts[0])
        confidence = float(parts[1])

        # 2D bbox
        x1, y1, x2, y2 = float(parts[2]), float(parts[3]), float(parts[4]), float(parts[5])

        # Filter small detections
        if self._filtered_by_size(x1, y1, x2, y2):
            return None

        result = {
            'label': label,
            'confidence': confidence,
            'bbox_2d': [x1, y1, x2, y2],
            '3d_info': None
        }

        # Check if this is a 3D class and has 3D info
        if label in self.CLASSES_3D and len(parts) >= 15:
            result['3d_info'] = self._parse_3d_info(parts)

        return result

    def _parse_3d_info(self, parts):
        """Parse 3D information from TXT detection parts.

        Raises:
            ValueError: if the parser is in 'ego' mode (the TXT format is
                always reported as camera-frame).
        """
        if self.coord_system == 'ego':
            raise ValueError("TXT detection format does not support ego-coordinate 3D evaluation.")
        # Format: label conf x1 y1 x2 y2 coord_sys x3d y3d z3d l3d h3d w3d rot_y face_type
        # Index:   0     1    2  3  4  5     6       7   8   9   10  11  12   13    14

        # Get face_type and normalize it
        face_type = parts[14] if len(parts) > 14 else 'whole'
        # Normalize rear/tail to back for consistency
        if face_type.lower() in ['rear', 'tail']:
            face_type = 'back'

        return {
            'center': [float(parts[7]), float(parts[8]), float(parts[9])],  # x3d, y3d, z3d
            'dimensions': [float(parts[10]), float(parts[11]), float(parts[12])],  # l3d, h3d, w3d
            'rotation': float(parts[13]),  # rot_y
            'face_type': face_type,
            'coord_system': 'camera',
        }

    def map_class_name(self, name_str):
        """Map class name string to class ID (-1 if unknown or not a string)."""
        if not isinstance(name_str, str):
            return -1
        return self.CLASS_NAME_TO_ID.get(name_str.lower(), -1)

    # ------------------------------------------------------------------
    # JSON format
    # ------------------------------------------------------------------
    def parse_det_json_entry(self, entry):
        """
        Parse a single entry from a detection JSON file.

        Det JSON entry format:
            {
                "type": "0",                # class id string
                "type_name": "vehicle",
                "score": "0.93",
                "roi_id": "0",
                "box2d": ["x1","y1","x2","y2"],   # absolute pixel coords
                "xyzlhwyaw": ["x3d","y3d","z3d","l","h","w","rot_y"],
                "face_cls": "front",        # front/tail/rear/left/right/whole/none
                "cut_cls": "0",
                "cut_cls_name": "nocut"
            }

        In 'ego' mode ``xyzlhwyaw_ego`` is read instead of ``xyzlhwyaw``.

        Returns:
            dict (label, confidence, bbox_2d, 3d_info, id, roi_id), or None
            when box2d is missing/malformed or the box is filtered by
            min_box_size.

        Raises:
            ValueError: in 'ego' mode, when a 3D-class entry has a valid
                camera-frame ``xyzlhwyaw`` but no usable ``xyzlhwyaw_ego``.
        """
        # Prefer the numeric type; fall back to the class name
        try:
            label = int(entry['type'])
        except (KeyError, ValueError, TypeError):
            label = self.map_class_name(entry.get('type_name', ''))

        # A missing or unparseable score is treated as 0.0
        try:
            confidence = float(entry['score'])
        except (KeyError, ValueError, TypeError):
            confidence = 0.0

        # box2d may be absent, null, too short or non-numeric: skip the entry
        corners = self._parse_box2d(entry.get('box2d'))
        if corners is None:
            return None
        x1, y1, x2, y2 = corners

        # Filter small detections
        if self._filtered_by_size(x1, y1, x2, y2):
            return None

        result = {
            'label': label,
            'confidence': confidence,
            'bbox_2d': [x1, y1, x2, y2],
            '3d_info': None,
            'id': entry.get('track_id', entry.get('id')),
            'roi_id': self._normalize_roi_id(entry.get('roi_id')),
        }

        # Parse 3D info for 3D classes
        if label in self.CLASSES_3D:
            is_ego = self.coord_system == 'ego'
            xyz_key = 'xyzlhwyaw_ego' if is_ego else 'xyzlhwyaw'
            xyzlhwyaw = entry.get(xyz_key)
            if not isinstance(xyzlhwyaw, (list, tuple)):
                # null / wrong type is handled like a missing field
                xyzlhwyaw = []

            # Ego evaluation on camera-only detections would silently drop
            # all 3D results; fail loudly when camera 3D exists instead.
            if is_ego and len(xyzlhwyaw) < 7 and self._has_valid_3d(entry.get('xyzlhwyaw')):
                raise ValueError(
                    "Detection JSON is missing ego-coordinate fields (xyzlhwyaw_ego). "
                    "Please export ego-coordinate detection results before running ego-coordinate evaluation."
                )

            if self._has_valid_3d(xyzlhwyaw):
                # Missing / empty / null face_cls defaults to 'whole'
                face_type = str(entry.get('face_cls', 'whole') or 'whole')
                # Normalize rear/tail to back for consistency
                if face_type.lower() in ('rear', 'tail'):
                    face_type = 'back'
                result['3d_info'] = {
                    'center':     [float(xyzlhwyaw[0]), float(xyzlhwyaw[1]), float(xyzlhwyaw[2])],
                    'dimensions': [float(xyzlhwyaw[3]), float(xyzlhwyaw[4]), float(xyzlhwyaw[5])],
                    'rotation':    yaw_to_radians(xyzlhwyaw[6], self.coord_system),
                    'face_type':   face_type,
                    'coord_system': self.coord_system,
                }

        return result

    @staticmethod
    def _normalize_roi_id(roi_id):
        """Normalize ROI identifiers like 'roi0'/'0' to plain numeric strings.

        Returns None for a None input or when nothing is left after
        stripping whitespace and the 'roi' prefix.
        """
        if roi_id is None:
            return None

        roi_id_str = str(roi_id).strip().lower()
        if roi_id_str.startswith('roi'):
            roi_id_str = roi_id_str[3:]
        return roi_id_str or None

    def parse_det_json_file(self, file_path):
        """
        Parse an entire detection JSON file.

        The JSON root is normally a dict keyed by object index ("0", "1", ...);
        a list root is also accepted. Numeric keys are processed in numeric
        order, followed by any non-numeric keys in string order.

        A missing file, invalid JSON, a null root or an unsupported root type
        yields an empty list (with a printed warning where applicable).
        Entries that are not dicts are skipped with a warning.

        Returns:
            list of parsed detection dicts
        """
        try:
            with open(file_path, 'r') as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"Warning: File not found: {file_path}")
            return []
        except json.JSONDecodeError as e:
            print(f"Warning: JSON decode error in {file_path}: {e}")
            return []

        if data is None:
            return []

        if isinstance(data, dict):
            items = sorted(data.items(), key=lambda item: self._json_sort_key(item[0]))
        elif isinstance(data, list):
            items = list(enumerate(data))
        else:
            print(f"Warning: unsupported detection JSON root type {type(data).__name__} in {file_path}")
            return []

        detections = []
        for key, entry in items:
            if not isinstance(entry, dict):
                print(f"Warning: skipping non-dict detection entry at key={key!r} in {file_path}")
                continue
            parsed = self.parse_det_json_entry(entry)
            if parsed is not None:
                detections.append(parsed)
        return detections

    def parse_file(self, file_path):
        """
        Parse entire detection file. Dispatches to JSON or TXT parser based on extension.

        Args:
            file_path: str, path to detection file (.txt or .json; the
                       extension check is case-insensitive)

        Returns:
            list of parsed detections (empty if the file does not exist)
        """
        if str(file_path).lower().endswith('.json'):
            return self.parse_det_json_file(file_path)

        detections = []
        try:
            with open(file_path, 'r') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    parsed = self.parse_line(line)
                    if parsed is not None:
                        detections.append(parsed)
        except FileNotFoundError:
            print(f"Warning: File not found: {file_path}")
            return []

        return detections