# yolov26_3d/tools/data_mining/convert_txt_to_json.py

#!/usr/bin/env python3
"""
Convert ground truth txt format to JSON format.

Usage:
    python convert_txt_to_json.py <input_txt_file> <output_json_file> [--image-width WIDTH] [--image-height HEIGHT] [--data-config YAML]
"""

import argparse
import json
from pathlib import Path

# from ultralytics.utils import YAML

# DEFAULT_DATA_CONFIG = Path(__file__).resolve().parents[2] / 'ultralytics' / 'cfg' / 'datasets' / 'mono3d_ground.yaml'
# Built-in label-name -> class-id table. Each row pairs one class id with every
# label name that maps to it, so names on the same row are aliases of one id.
DEFAULT_CLASS_MAP = {
    label_name: class_id
    for class_id, label_names in (
        (0, ('car',)),
        (1, ('suv',)),
        (2, ('pickup',)),
        (3, ('medium_car',)),
        (4, ('van',)),
        (5, ('bus',)),
        (6, ('truck', 'tanker', 'large_truck', 'construction_vehicle')),
        (7, ('special_vehicle',)),
        (8, ('unknown',)),
        (9, ('pedestrian',)),
        (10, ('bicyclist', 'motorcyclist')),
        (11, ('bicycle', 'motorcycle')),
        (12, ('tricycle', 'tricyclist')),
        (13, ('traffic_sign',)),
        (14, ('wheel',)),
        (15, ('plate',)),
        (16, ('face',)),
    )
    for label_name in label_names
}

# Integer id -> name table (ids follow the tuple order, starting at 0).
# NOTE(review): not referenced by any code visible in this file; presumably
# the cut-in / cut-out class names used elsewhere -- confirm before removing.
cutcls_map = dict(enumerate(('nocut', 'cutin', 'cutout')))

# Placeholder field values used when a label line carries no 3D data:
# 13 entries for "3d_ori" and 8 entries for each "3d_<face>" field.
EMPTY_3D_ORI = ["-1.0" for _ in range(13)]
EMPTY_3D_FACE = ["-1.0" for _ in range(8)]


def load_class_map(data_config_path: str | Path | None = None) -> dict[str, int]:
    """Return a fresh copy of the built-in class map.

    Args:
        data_config_path: Accepted for interface compatibility only. It is not
            read, because the YAML-based lookup below is currently disabled.

    Returns:
        A new dict with the same contents as DEFAULT_CLASS_MAP, so callers can
        modify the result without touching the module-level table.
    """
    # Disabled YAML lookup, kept for reference:
    # config_path = Path(data_config_path).expanduser().resolve() if data_config_path else DEFAULT_DATA_CONFIG
    # if config_path.exists():
    #     class_map = YAML.load(config_path).get('class_map') or {}
    #     if class_map:
    #         return {str(key): int(value) for key, value in class_map.items()}
    return dict(DEFAULT_CLASS_MAP)


def _stringify(value: float | int) -> str:
    """Convert numeric values to the string form used by evaluator JSON files."""
    return str(value)


def _denormalize_box(x_norm: float, y_norm: float, w_norm: float, h_norm: float, img_width: int, img_height: int) -> list[str]:
    """Turn a normalized center/size box into pixel corner coordinates.

    Args:
        x_norm, y_norm: Box center, scaled by img_width / img_height.
        w_norm, h_norm: Box size, scaled by img_width / img_height.
        img_width, img_height: Image size in pixels.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` in pixels, each converted to a string.
    """
    center_x = x_norm * img_width
    center_y = y_norm * img_height
    half_w = (w_norm * img_width) / 2
    half_h = (h_norm * img_height) / 2
    corners = (
        center_x - half_w,
        center_y - half_h,
        center_x + half_w,
        center_y + half_h,
    )
    return [_stringify(corner) for corner in corners]


def _empty_3d_result(result: dict) -> dict:
    """Fill every 3D field of ``result`` with placeholders and return it.

    Sets "3d_ori" to a copy of EMPTY_3D_ORI, then delegates the four face
    fields to ``_empty_3d_faces``. The dict is modified in place.
    """
    result["3d_ori"] = list(EMPTY_3D_ORI)
    filled = _empty_3d_faces(result)
    return filled


def _empty_3d_faces(result: dict) -> dict:
    """Fill the four face fields of ``result`` with placeholders and return it.

    Each of "3d_front", "3d_back", "3d_left" and "3d_right" receives its own
    copy of EMPTY_3D_FACE, so the lists are not shared between fields or
    entries. The dict is modified in place.
    """
    for face_key in ("3d_front", "3d_back", "3d_left", "3d_right"):
        result[face_key] = list(EMPTY_3D_FACE)
    return result


def _build_face(face_values: list[float], img_width: int, img_height: int) -> list[str]:
    """Convert one 8-value face block to the string list stored in the JSON.

    Entries 4 and 5 are multiplied by ``img_width`` and ``img_height``
    respectively (presumably a normalized image point -- confirm against the
    label spec); the other six entries are passed through unchanged.

    Raises:
        IndexError: If ``face_values`` holds fewer than 8 entries.
    """
    pixel_scale = {4: img_width, 5: img_height}
    return [
        _stringify(face_values[i] * pixel_scale[i] if i in pixel_scale else face_values[i])
        for i in range(8)
    ]


def _extract_occlusion(parts: list[float], ncols: int) -> float:
    """Extract the occlusion attribute from a parsed txt line."""
    if ncols in {6, 19, 51}:
        return int(parts[-1])
    if ncols == 7:
        return int(parts[-2])
    raise ValueError(f"Unsupported label column count {ncols} for occlusion extraction")


def _build_3d_ori(parts: list[float], img_width: int, img_height: int) -> list[str]:
    """Build the 13-entry "3d_ori" list from values 4..16 of a parsed line.

    Values 4..10 and 15 are passed through, values 11/13 are multiplied by
    ``img_width`` and values 12/14 by ``img_height``. Value 16 (a flag) is
    written as an integer when it has no fractional part.
    """
    flag = parts[16]
    return [
        *(_stringify(value) for value in parts[4:11]),
        _stringify(parts[11] * img_width),
        _stringify(parts[12] * img_height),
        _stringify(parts[13] * img_width),
        _stringify(parts[14] * img_height),
        _stringify(parts[15]),
        _stringify(int(flag) if flag.is_integer() else flag),
    ]


def parse_txt_line(line: str, class_map: dict[str, int], img_width: int = 1920, img_height: int = 1080) -> dict | None:
    """
    Parse a single line from the txt file and convert to JSON object structure.

    Supported layouts, counted in whitespace-separated columns including the
    leading label name:
        6:  label, 4 normalized box values, occlusion
        7:  label, 4 normalized box values, occlusion, 1 trailing value (ignored)
        19: label, 4 normalized box values, 13 3D values, occlusion
        51: label, 4 normalized box values, 13 3D values, 4 x 8 face values, occlusion

    Args:
        line: Single line from txt file
        class_map: Mapping from label name to numeric class id
        img_width: Image width for denormalization
        img_height: Image height for denormalization

    Returns:
        Dictionary with parsed data in JSON format, or None when the line has
        fewer than two columns, an unknown label name, a non-numeric value, or
        fewer than four numeric values.

    Raises:
        ValueError: If the line has at least four numeric values but its
            column count is not one of the supported layouts.
    """
    raw = line.strip().split()
    if len(raw) < 2:
        return None

    label_name = raw[0]
    label = class_map.get(label_name)
    if label is None:
        return None

    try:
        parts = [float(token) for token in raw[1:]]
    except ValueError:
        return None
    if len(parts) < 4:
        return None

    # Reject unsupported layouts here so the error carries the offending line;
    # otherwise the occlusion lookup below fails first with a less specific message.
    ncols = len(raw)
    if ncols not in {6, 7, 19, 51}:
        raise ValueError(f"Unsupported label column count {ncols} for line: {line}")

    x_norm, y_norm, w_norm, h_norm = parts[0:4]
    result = {
        "type": str(label),
        "type_name": label_name,
        "roi_id": "1",
        "occlusion": _stringify(_extract_occlusion(parts, ncols)),
        "box2d": _denormalize_box(x_norm, y_norm, w_norm, h_norm, img_width, img_height),
    }

    # 2D-only layouts: every 3D field gets placeholder values.
    if ncols in {6, 7}:
        return _empty_3d_result(result)

    result["3d_ori"] = _build_3d_ori(parts, img_width, img_height)

    # 19 columns: 3D values are present but the face blocks are not.
    if ncols == 19:
        return _empty_3d_faces(result)

    # 51 columns: four consecutive 8-value face blocks start at value 17.
    face_keys = ("3d_front", "3d_back", "3d_left", "3d_right")
    for start, face_key in zip(range(17, 49, 8), face_keys):
        result[face_key] = _build_face(parts[start:start + 8], img_width, img_height)
    return result


def _resolve_convert_args(class_map_or_img_width, img_width, img_height, data_config_path):
    """Support both legacy convert_txt_to_json(txt, json, w, h) and current class_map-based calls."""
    if isinstance(class_map_or_img_width, dict):
        return class_map_or_img_width, int(img_width), int(img_height)

    if isinstance(class_map_or_img_width, (int, float)) and not isinstance(class_map_or_img_width, bool):
        return load_class_map(data_config_path), int(class_map_or_img_width), int(img_width)

    if class_map_or_img_width is None:
        return load_class_map(data_config_path), int(img_width), int(img_height)

    raise TypeError("class_map_or_img_width must be a class_map dict, image width, or None")


def convert_txt_to_json(
    txt_file,
    json_file,
    class_map_or_img_width=None,
    img_width=1920,
    img_height=1080,
    data_config_path: str | Path | None = None,
):
    """
    Convert txt ground truth file to JSON format.

    The output is a JSON object keyed by the zero-based index of each input
    line (as a string). Blank lines and lines that ``parse_txt_line`` rejects
    with None are left out, so the keys may have gaps.

    Args:
        txt_file: Path to input txt file
        json_file: Path to output JSON file
        class_map_or_img_width: Either a label-name -> class-id dict, None to
            use the class map from ``load_class_map``, or (legacy positional
            form) the image width, in which case ``img_width`` is read as the
            image height.
        img_width: Image width for denormalization
        img_height: Image height for denormalization
        data_config_path: Forwarded to ``load_class_map`` when no class map
            dict is supplied.

    Raises:
        FileNotFoundError: If ``txt_file`` does not exist.
        TypeError: If ``class_map_or_img_width`` has an unsupported type.
        ValueError: If a line cannot be converted; the message names the
            file and the 1-based line number.
    """
    txt_path = Path(txt_file)
    json_path = Path(json_file)
    class_map, img_width, img_height = _resolve_convert_args(
        class_map_or_img_width,
        img_width,
        img_height,
        data_config_path,
    )

    if not txt_path.exists():
        raise FileNotFoundError(f"Input file not found: {txt_file}")

    # Parse each line and build JSON structure. The file is streamed and read
    # as UTF-8 so the result does not depend on the platform's locale.
    json_data = {}
    with open(txt_path, 'r', encoding='utf-8') as f:
        for idx, line in enumerate(f):
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            try:
                obj_data = parse_txt_line(line, class_map, img_width, img_height)
            except ValueError as err:
                # Same exception type, with the location added for diagnosis.
                raise ValueError(f"{txt_file}, line {idx + 1}: {err}") from err

            if obj_data:
                json_data[str(idx)] = obj_data

    # Write JSON file
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_data, f, indent=4)

    print(f"Converted {len(json_data)} objects from {txt_file} to {json_file}")
    print(f"Image dimensions used: {img_width}x{img_height}")


def main(argv=None):
    """Command-line entry point: parse arguments and run the conversion.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the arguments from ``sys.argv[1:]``, which is the behavior of
            running the script directly.
    """
    parser = argparse.ArgumentParser(
        description='Convert ground truth txt format to JSON format'
    )
    parser.add_argument('input_txt', help='Input txt file path')
    parser.add_argument('output_json', help='Output JSON file path')
    parser.add_argument('--image-width', type=int, default=1920,
                       help='Image width for denormalization (default: 1920)')
    parser.add_argument('--image-height', type=int, default=1080,
                       help='Image height for denormalization (default: 1080)')
    # The option is kept so existing command lines still parse, but the value
    # has no effect while YAML loading in load_class_map is disabled.
    parser.add_argument(
        '--data-config',
        type=str,
        default='', #str(DEFAULT_DATA_CONFIG),
        help='Dataset YAML path for class_map (currently ignored: the built-in class map is always used)',
    )

    args = parser.parse_args(argv)
    convert_txt_to_json(
        args.input_txt,
        args.output_json,
        class_map_or_img_width=None,
        img_width=args.image_width,
        img_height=args.image_height,
        data_config_path=args.data_config,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()