Files
yolov26_3d/ultralytics/cfg/datasets/mono3d_ground.yaml
2026-06-24 09:35:46 +08:00

85 lines
4.3 KiB
YAML
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Ground 3D Detection Dataset for Mono3D
# Joint 2D + 3D detection with fisheye camera support
# Label format: 19-col (complete_3d) or 51-col (face_3d) per object
# Train/val/test sets as 1) dir: path/to/imgs, 2) file: path/to/imgs.txt, or 3) list: [path/to/imgs1, path/to/imgs2, ..]
# Dataset root directory that holds the images.
# NOTE(review): this points at driving_png_20260202 while the train/val lists
# below live under driving_png_20260320 — confirm the mismatch is intentional.
path: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260202
# Ground-truth list file for the training split.
train: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260320/train.txt
# Ground-truth list file for the validation split.
val: /mnt/nfs/mono3d/xdzhu_data/Mono3d/Mono3d_4face_2m_g1m3/driving_png_20260320/val.txt
# Test images (optional); explicitly unset.
test: null
# Class mapping: string class names to numeric IDs.
# Format: class_name: class_id. Several names may share one ID, which merges
# them into a single class (e.g. truck, tanker and large_truck all map to 6).
# Entries must be indented under class_map; at column 0 they would become
# unrelated top-level keys and class_map itself would be null.
class_map:
  car: 0
  suv: 1
  pickup: 2
  medium_car: 3
  van: 4
  bus: 5
  truck: 6
  tanker: 6
  large_truck: 6
  construction_vehicle: 6
  special_vehicle: 7
  unknown: 8
  pedestrian: 9
  bicyclist: 10
  motorcyclist: 10
  bicycle: 11
  motorcycle: 11
  tricycle: 12
  tricyclist: 12
  traffic_sign: 13
  wheel: 14
  plate: 15
  face: 16
  car_fake: 17
  bicyclist_fake: 18
  pedestrian_fake: 19
  # Additional truck variants merged into class 6.
  car_carrier: 6
  platform_truck: 6
# Training parameters

# Minimum box side in pixels: boxes whose width or height is at least this
# value are kept.
min_wh: 8.0

# Color space of the input images: false selects standard RGB/BGR input
# rather than YUV444.
use_yuv444: false

# Difficulty-based loss weighting.
# NOTE(review): presumably one weight per difficulty level, in order —
# confirm against the loss code.
difficulty_weights: [1.0, 1.0, 0.7, 0.3]
# 3D detection parameters
# Class groups used for 3D label parsing; every entry is a mapped class ID.

# Vehicles annotated with four faces (51-column labels).
face_3d_classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 17]

# Pedestrian/bike classes carrying whole-box 3D annotations only
# (19-column labels).
complete_3d_classes: [9, 10, 11, 12, 18, 19]

# Fake classes, which receive an additional dedicated 3D prediction head.
fake_3d_classes: [17, 18, 19]
# Camera
# Original image size as [width, height].
ori_img_size: [1920, 1080]

# ROI-specific presets resolved by train_mono3d.py via --roi=<name>.
# Each preset is a nested mapping under roi_configs; the per-preset keys must
# stay indented, otherwise roi/virtual_fx/... collide as duplicate top-level
# keys and the presets themselves become null.
default_roi: roi0
roi_configs:
  roi0:
    # ROI size [width, height], cropped from [w//2, vanishing_point_y].
    # Keeps the full image width and trims only part of the top/bottom area,
    # which suits a wide field of view.
    roi: [1920, 880]
    # Target focal length x for the virtual camera (pixels).
    # 537 is not computed at runtime; it is a manually chosen target focal
    # length / normalization baseline, typically taken from the target input
    # size, the typical focal length after ROI cropping, or earlier model
    # settings.
    virtual_fx: 537
    # -1 = always use virtual_fx only; >0 = probability of virtual-camera
    # augmentation. With -1 the virtual-camera branch is never taken: only
    # the plain ROI crop + resize + depth normalization path runs.
    virtual_camera_prob: -1.0
    # Crop center mode: cxvy = crop around image center x and vanishing
    # point y.
    crop_center_mode: cxvy
  roi1:
    # ROI size [width, height], cropped from
    # [vanishing_point_x, vanishing_point_y].
    roi: [768, 352]
    # Target focal length x for the virtual camera (pixels).
    virtual_fx: 537
    # 50% probability of taking the virtual-camera branch.
    virtual_camera_prob: 0.5
    # Enable virtual-camera random zoom during validation/TensorBoard while
    # keeping the crop center fixed.
    virtual_camera_val_zoom: true
    # Crop center mode: vxvy = crop around vanishing point x and vanishing
    # point y.
    crop_center_mode: vxvy
# 3D normalization scales (for loss computation).
# These should normally come from offline statistics (e.g. the mean/scale of
# depth z and object size over the training set) or be carried over from
# earlier models. Their purpose:
#   1. give the 3D head a reasonable physical prior;
#   2. keep the magnitude of the z and size losses stable;
#   3. keep training outputs and decode/visualization on the same scale.
# The entries must be indented under norm_scales_3d; at column 0 they would
# become top-level keys and norm_scales_3d itself would be null.
norm_scales_3d:
  # z3d_norm = (z3d - z3d_offset) / z3d_scale
  # A true depth of 39.937 m maps to a prediction of 0; every additional
  # 24.415 m of true depth raises the prediction by 1.
  z3d_scale: 24.415
  z3d_offset: 39.937
  # size_norm = (size - size_offset) / size_scale
  # A length of 3.78 m maps to a prediction of 0; every additional 1.945 m
  # raises the prediction by 1.
  size_scale: 1.945
  size_offset: 3.780
  # pi/2
  yaw_scale: 1.5707963