# yolov26_3d/ultralytics/cfg/datasets/mono3d_ground.yaml

# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

# Ground 3D Detection Dataset for Mono3D
# Joint 2D + 3D detection with fisheye camera support
# Label format: 19-col (complete_3d) or 51-col (face_3d) per object

# Dataset locations. Each of train/val/test may be given as
#   1) dir:  path/to/imgs
#   2) file: path/to/imgs.txt
#   3) list: [path/to/imgs1, path/to/imgs2, ..]
# Here train and val are absolute paths to .txt list files and test is left empty.
# NOTE(review): `path` points at driving_png_20260202 while the train/val lists live
# under driving_png_20260320 - confirm that mixing the two snapshots is intended.
path: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260202  # dataset root dir of images
train: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260320/train.txt  # train GT list
val: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260320/val.txt  # val GT list
test:  # test images (optional)

# Class mapping: string class names (as they appear in labels) to numeric class IDs.
# Format: class_name: class_id
# Several names may share one ID, which merges them into a single class
# (illustrative example of the syntax: car: 0, van: 0).
# The map below yields 20 distinct IDs (0-19).
# NOTE(review): entry order is kept as-is; confirm whether the loader derives the
# display name of a merged ID from the first or last name mapped to it before reordering.
class_map:
  # IDs 0-8: listed under face_3d_classes further down in this file
  car: 0
  suv: 1
  pickup: 2
  medium_car: 3
  van: 4
  bus: 5
  truck: 6
  tanker: 6  # shares ID 6 with truck
  large_truck: 6  # shares ID 6 with truck
  construction_vehicle: 6  # shares ID 6 with truck
  special_vehicle: 7
  unknown: 8
  # IDs 9-12: listed under complete_3d_classes further down in this file
  pedestrian: 9
  bicyclist: 10
  motorcyclist: 10  # shares ID 10 with bicyclist
  bicycle: 11
  motorcycle: 11  # shares ID 11 with bicycle
  tricycle: 12
  tricyclist: 12  # shares ID 12 with tricycle
  # IDs 13-16: not listed in any of the 3D class groups in this file
  traffic_sign: 13
  wheel: 14
  plate: 15
  face: 16
  # IDs 17-19: "fake" classes, listed under fake_3d_classes further down in this file
  car_fake: 17
  bicyclist_fake: 18
  pedestrian_fake: 19
  # Additional names that share ID 6 with truck
  car_carrier: 6
  platform_truck: 6

# Training parameters
# Minimum 2D box size filter, in pixels.
# NOTE(review): the wording says "width or height" - confirm in the loader whether a box
# is kept when either side reaches the threshold or only when both do.
min_wh: 8.0  # Keep boxes whose width or height is at least this many pixels

# Color space of input images
# false = inputs are standard RGB/BGR images.
# NOTE(review): presumably true switches the pipeline to YUV444 input - verify in the loader.
use_yuv444: false  # Standard RGB/BGR images (not YUV444)

# Difficulty-based loss weighting: a list of four loss weights.
# NOTE(review): presumably one weight per difficulty level, indexed by the label's
# difficulty value (0..3) - confirm the index order against the loss code.
difficulty_weights: [1.0, 1.0, 0.7, 0.3]

# 3D Detection parameters
# Class groups for 3D label parsing (by mapped class ID, i.e. the values of class_map)
# - face_3d_classes: IDs 0-8 (car .. unknown) plus 17 (car_fake)
# - complete_3d_classes: IDs 9-12 (pedestrian .. tricycle/tricyclist) plus 18, 19 (bicyclist_fake, pedestrian_fake)
# - fake_3d_classes: IDs 17-19; each of these also appears in one of the two groups above
# IDs 13-16 (traffic_sign, wheel, plate, face) are in none of the groups.
# NOTE(review): presumably those classes are 2D-only - confirm in the label parser.
face_3d_classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 17]      # vehicles with 4-face annotations (51-col labels)
complete_3d_classes: [9, 10, 11, 12, 18, 19]          # pedestrian/bike with whole-box 3D only (19-col labels)
fake_3d_classes: [17, 18, 19]                         # fake classes with an additional dedicated 3D prediction head

# Camera
# Size of the original (uncropped) image in pixels, width first.
# The roi0 preset in this file uses the full 1920 px width.
ori_img_size: [1920, 1080]  # Original image size [width, height]

# ROI-specific presets resolved by train_mono3d.py via --roi=<name>
# default_roi names the preset under roi_configs used by default.
# Both presets share the same aspect ratio (24:11); roi1 is 2.5x smaller than roi0 per side.
default_roi: roi0
roi_configs:
  roi0:
    # Keeps the full image width and crops away only part of the top/bottom area;
    # suited to a wide field of view.
    roi: [1920, 880]              # ROI size [width, height], crop from [w//2, vanishing_point_y]
    # 537 is not computed at runtime; it is a hand-picked target focal length /
    # normalization reference, typically derived from the target input size, the
    # typical focal length after ROI cropping, or the settings of an earlier model.
    virtual_fx: 537               # Target focal length x for virtual camera (pixels)
    # With -1 the virtual-camera branch is never taken; only the plain
    # ROI crop + resize + depth normalization path runs.
    virtual_camera_prob: -1.0     # -1 = always use virtual_fx only, >0 = probability of virtual camera augmentation
    crop_center_mode: cxvy        # Crop center mode: cxvy = crop around image center x and vanishing point y
    # NOTE(review): roi0 sets no virtual_camera_val_zoom; presumably the loader's default applies - confirm.
  roi1:
    roi: [768, 352]               # ROI size [width, height], crop from [vanishing_point_x, vanishing_point_y]
    virtual_fx: 537               # Target focal length x for virtual camera (pixels)
    virtual_camera_prob: 0.5      # 50% chance of taking the virtual-camera branch
    virtual_camera_val_zoom: true # Enable virtual-camera random zoom during validation/TensorBoard while keeping crop center fixed
    crop_center_mode: vxvy        # Crop center mode: vxvy = crop around vanishing point x and vanishing point y

# 3D Normalization scales (for loss computation)
# These values should normally come from offline statistics (e.g. the mean/scale of
# depth z and object size over the training set) or be carried over from an earlier model.
# Purpose:
#   1. give the 3D head a sensible physical prior;
#   2. keep the magnitude of the z and size losses stable;
#   3. keep training outputs and decode/visualization on the same scale.
norm_scales_3d:
  # A true depth of 39.937 m maps to a normalized value of 0;
  # every additional 24.415 m of depth adds 1 to the normalized value.
  z3d_scale: 24.415       # z3d_norm = (z3d - z3d_offset) / z3d_scale
  z3d_offset: 39.937
  # A size of 3.78 m maps to a normalized value of 0;
  # every additional 1.945 m adds 1 to the normalized value.
  size_scale: 1.945       # size_norm = (size - size_offset) / size_scale
  size_offset: 3.780
  # NOTE(review): no yaw offset is defined; presumably yaw_norm = yaw / yaw_scale - confirm in the loss/decode code.
  yaw_scale: 1.5707963    # pi/2