import json
from pathlib import Path

import cv2
import numpy as np
import pytest
from PIL import Image

from train_mono3d import resolve_data_yaml_for_roi
from ultralytics.data.dataset import Ground3DCalibrationError, YOLOGround3DDataset
from ultralytics.data.ground3d_augment import read_calib_from_path
from ultralytics.utils import YAML


def write_dataset_yaml(path: Path) -> None:
    """Save a dataset YAML containing two ROI presets (``roi0`` as default, plus ``roi1``) to *path*."""
    YAML.save(
        file=path,
        data={
            "path": "/tmp/dataset",
            "train": "train.txt",
            "val": "val.txt",
            "class_map": {"car": 0},
            "default_roi": "roi0",
            "roi_configs": {
                "roi0": {
                    "roi": [1920, 880],
                    "virtual_fx": 537,
                    "virtual_camera_prob": -1.0,
                    "crop_center_mode": "cxvy",
                },
                "roi1": {
                    "roi": [768, 352],
                    "virtual_fx": 537,
                    "virtual_camera_prob": 0.5,
                    "virtual_camera_val_zoom": True,
                    "crop_center_mode": "vxvy",
                },
            },
        },
    )


def write_clip_level_camera4(calib_dir: Path, image_size: tuple[int, int], focal_u: float = 50.0) -> Path:
    """
    Write ``<calib_dir>/L2_calib/camera4.json`` and return its path.

    The principal point is placed at the image centre, both focal lengths are set to *focal_u*, pitch is zero and
    there are no distortion coefficients.
    """
    camera4_file = calib_dir / "L2_calib" / "camera4.json"
    camera4_file.parent.mkdir(parents=True, exist_ok=True)
    camera4_file.write_text(
        json.dumps(
            {
                "focal_u": focal_u,
                "focal_v": focal_u,
                "cu": image_size[0] / 2,
                "cv": image_size[1] / 2,
                "pitch": 0.0,
                "distort_coeffs": [],
            }
        ),
        encoding="utf-8",
    )
    return camera4_file


def _write_sample(label_file: Path, image_file: Path, label_text: str, image_size: tuple[int, int]) -> None:
    """Create one label file and one solid-colour RGB image, creating parent directories as needed."""
    label_file.parent.mkdir(parents=True, exist_ok=True)
    image_file.parent.mkdir(parents=True, exist_ok=True)
    label_file.write_text(label_text, encoding="utf-8")
    Image.new("RGB", image_size, color=(32, 64, 96)).save(image_file)


def _build_dataset(
    gt_root: Path,
    image_root: Path,
    *,
    imgsz: tuple[int, int] = (64, 32),
    roi: tuple[int, int] | None = None,
    ori_img_size: tuple[int, int] | None = None,
    virtual_fx: float = 50,
    class_map: dict[str, int] | None = None,
    complete_3d_classes: list[int] | None = None,
    **dataset_kwargs,
) -> YOLOGround3DDataset:
    """
    Construct a ``YOLOGround3DDataset`` over ``<gt_root>/train.txt`` with the settings shared by these tests.

    Args:
        gt_root: Directory holding ``train.txt`` (and the label files it lists).
        image_root: Value passed as the ``path`` entry of the ``data`` config.
        imgsz: Dataset ``imgsz``; also the fallback for *roi* and *ori_img_size*.
        roi: ``roi`` entry of the ``data`` config; defaults to *imgsz*.
        ori_img_size: ``ori_img_size`` entry of the ``data`` config; defaults to *imgsz*.
        virtual_fx: ``virtual_fx`` entry of the ``data`` config.
        class_map: ``class_map`` entry of the ``data`` config; defaults to ``{"car": 0}``.
        complete_3d_classes: When given, added to the ``data`` config under ``complete_3d_classes``.
        **dataset_kwargs: Extra or overriding keyword arguments for the dataset constructor (e.g. ``rect``,
            ``classes``).
    """
    data = {"path": str(image_root), "class_map": class_map or {"car": 0}}
    if complete_3d_classes is not None:
        data["complete_3d_classes"] = list(complete_3d_classes)
    data.update(
        {
            "roi": list(roi or imgsz),
            "ori_img_size": list(ori_img_size or imgsz),
            "virtual_fx": virtual_fx,
            "virtual_camera_prob": -1.0,
            "crop_center_mode": "cxvy",
        }
    )
    options = {
        "batch_size": 1,
        "augment": False,
        "rect": False,
        "stride": 32,
        "pad": 0.5,
        "prefix": "test: ",
        "task": "detect",
    }
    options.update(dataset_kwargs)  # caller-supplied values win over the shared defaults
    return YOLOGround3DDataset(img_path=str(gt_root / "train.txt"), imgsz=list(imgsz), data=data, **options)


def create_ground3d_dataset(
    tmp_path: Path,
    image_sizes: list[tuple[int, int]],
    imgsz: tuple[int, int] = (64, 32),
    roi: tuple[int, int] | None = None,
    ori_img_size: tuple[int, int] | None = None,
) -> tuple[YOLOGround3DDataset, list[str]]:
    """
    Build a dataset with one 2D-only ``car`` sample per entry of *image_sizes*.

    Sample ``idx`` (1-based) lives in sequence ``seq{idx}`` with its own clip-level ``camera4.json``.

    Returns:
        The dataset and the resolved image paths, in the same order as *image_sizes*.
    """
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_labels = []
    image_files = []
    for idx, image_size in enumerate(image_sizes, start=1):
        rel_label = Path(f"labels/seq{idx}/frame_{idx:04d}.txt")
        image_file = image_root / "images" / f"seq{idx}" / f"frame_{idx:04d}.png"
        _write_sample(gt_root / rel_label, image_file, "car 0.5 0.5 0.25 0.25 0\n", image_size)
        write_clip_level_camera4(gt_root / "calib" / f"seq{idx}", image_size)
        rel_labels.append(rel_label.as_posix())
        image_files.append(str(image_file.resolve()))

    (gt_root / "train.txt").write_text("\n".join(rel_labels) + "\n", encoding="utf-8")
    dataset = _build_dataset(gt_root, image_root, imgsz=imgsz, roi=roi, ori_img_size=ori_img_size)
    return dataset, image_files


def test_resolve_data_yaml_for_roi_uses_default_roi(tmp_path):
    """Without an explicit ROI, the ``default_roi`` preset is selected and flattened into a new YAML file."""
    data_yaml = tmp_path / "mono3d_ground.yaml"
    write_dataset_yaml(data_yaml)

    resolved_path, selected_roi = resolve_data_yaml_for_roi(str(data_yaml), None)

    assert selected_roi == "roi0"
    assert resolved_path != str(data_yaml)
    resolved_cfg = YAML.load(resolved_path)
    assert resolved_cfg["roi"] == [1920, 880]
    assert resolved_cfg["virtual_camera_prob"] == -1.0
    assert resolved_cfg["crop_center_mode"] == "cxvy"
    assert "default_roi" not in resolved_cfg
    assert "roi_configs" not in resolved_cfg


def test_resolve_data_yaml_for_roi_supports_explicit_override(tmp_path):
    """An explicitly requested preset takes precedence over ``default_roi``."""
    data_yaml = tmp_path / "mono3d_ground.yaml"
    write_dataset_yaml(data_yaml)

    resolved_path, selected_roi = resolve_data_yaml_for_roi(str(data_yaml), "roi1")

    assert selected_roi == "roi1"
    resolved_cfg = YAML.load(resolved_path)
    assert resolved_cfg["roi"] == [768, 352]
    assert resolved_cfg["virtual_camera_prob"] == 0.5
    assert resolved_cfg["virtual_camera_val_zoom"] is True
    assert resolved_cfg["crop_center_mode"] == "vxvy"


def test_resolve_data_yaml_for_roi_uses_unique_temp_paths(tmp_path):
    """Resolving the same preset twice yields two distinct output paths."""
    data_yaml = tmp_path / "mono3d_ground.yaml"
    write_dataset_yaml(data_yaml)

    resolved_path_a, selected_roi_a = resolve_data_yaml_for_roi(str(data_yaml), "roi1")
    resolved_path_b, selected_roi_b = resolve_data_yaml_for_roi(str(data_yaml), "roi1")

    assert selected_roi_a == "roi1"
    assert selected_roi_b == "roi1"
    assert resolved_path_a != resolved_path_b


def test_resolve_data_yaml_for_roi_rejects_unknown_preset(tmp_path):
    """Requesting a preset that is not configured raises ``ValueError`` listing the available presets."""
    data_yaml = tmp_path / "mono3d_ground.yaml"
    write_dataset_yaml(data_yaml)

    with pytest.raises(ValueError, match="Available presets: roi0, roi1"):
        resolve_data_yaml_for_roi(str(data_yaml), "roi2")


def test_resolve_data_yaml_for_roi_rejects_missing_required_ground3d_fields(tmp_path):
    """A preset lacking ``crop_center_mode`` is rejected with a ``ValueError`` naming that field."""
    data_yaml = tmp_path / "mono3d_ground.yaml"
    write_dataset_yaml(data_yaml)
    data_cfg = YAML.load(data_yaml)
    del data_cfg["roi_configs"]["roi1"]["crop_center_mode"]
    YAML.save(data_yaml, data_cfg)

    with pytest.raises(ValueError, match="crop_center_mode"):
        resolve_data_yaml_for_roi(str(data_yaml), "roi1")


def test_ground3d_dataset_resolves_gt_list_to_image_and_calib(tmp_path):
    """A label path listed in ``train.txt`` is mapped to its image under the image root and to its calibration."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.png"
    _write_sample(gt_root / rel_label, image_file, "car 0.5 0.5 0.25 0.25 0\n", (64, 32))
    write_clip_level_camera4(gt_root / "calib" / "seq0", (64, 32))
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root)

    assert len(dataset.labels) == 1
    assert dataset.labels[0] == (str(gt_root.resolve()), rel_label.as_posix())

    raw_calib = read_calib_from_path(
        str(image_file.resolve()),
        image_root=image_root,
        extra_calib_candidates=[str((gt_root / "calib" / "seq0" / "frame_0001.json").resolve())],
    )
    assert raw_calib["focal_u"] == 50.0

    sample = dataset.get_image_and_label(0)
    assert sample["im_file"] == str(image_file.resolve())
    assert sample["img"].shape[:2] == (32, 64)
    assert sample["calib"]["fx"] == pytest.approx(50.0)


def test_ground3d_dataset_prefers_label_root_calibration_over_image_root(tmp_path):
    """When both roots provide calibration, the one under the label root (fx=80) is used, not the image root's."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.png"
    image_calib_file = image_root / "calib" / "seq0" / "frame_0001.json"
    _write_sample(gt_root / rel_label, image_file, "car 0.5 0.5 0.25 0.25 0\n", (64, 32))
    write_clip_level_camera4(gt_root / "calib" / "seq0", (64, 32), focal_u=80.0)
    image_calib_file.parent.mkdir(parents=True, exist_ok=True)
    image_calib_file.write_text(
        json.dumps({"focal_u": 50.0, "focal_v": 50.0, "cu": 32.0, "cv": 16.0, "pitch": 0.0, "distort_coeffs": []}),
        encoding="utf-8",
    )
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root)

    sample = dataset.get_image_and_label(0)
    assert sample["calib"]["fx"] == pytest.approx(80.0)


def test_ground3d_dataset_reads_clip_level_camera4_from_label_root(tmp_path):
    """A ``<clip>/calib/L2_calib/camera4.json`` under the label root is found for a clip-nested directory layout."""
    gt_root = tmp_path / "gt_20260320"
    image_root = tmp_path / "dataset_20260202"
    rel_label = Path("seq0/clip0/labels/frame_0001.txt")
    image_file = image_root / "seq0" / "clip0" / "images" / "frame_0001.png"
    clip_calib_file = gt_root / "seq0" / "clip0" / "calib" / "L2_calib" / "camera4.json"
    _write_sample(
        gt_root / rel_label,
        image_file,
        "car 0.5 0.5 0.25 0.25 1 2 3 4 5 6 0.1 0.2 0.3 9 10 11 12 0\n",
        (1920, 1080),
    )
    clip_calib_file.parent.mkdir(parents=True, exist_ok=True)
    clip_calib_file.write_text(
        json.dumps(
            {
                "focal_u": 1450.9230324555967,
                "focal_v": 1458.0023697476843,
                "cu": 949.5149041625389,
                "cv": 569.9146363123367,
                "distort_coeffs": [-0.6, 0.7, -0.5, 0.2],
                "pitch": 0.214,
                "roll": 1.077,
                "yaw": -0.643,
            }
        ),
        encoding="utf-8",
    )
    # The list entry carries a leading "./" to check that such prefixes are tolerated.
    (gt_root / "train.txt").write_text(f"./{rel_label.as_posix()}\n", encoding="utf-8")

    raw_calib = read_calib_from_path(
        str(image_file.resolve()),
        image_root=image_root,
        extra_calib_candidates=[str((gt_root / "seq0" / "clip0" / "calib" / "frame_0001.json").resolve())],
    )
    assert raw_calib is not None
    assert raw_calib["focal_u"] == pytest.approx(1450.9230324555967)
    # The stored pitch is expected back converted from degrees to radians.
    assert raw_calib["pitch"] == pytest.approx(np.deg2rad(0.214))

    dataset = _build_dataset(
        gt_root,
        image_root,
        imgsz=(768, 352),
        ori_img_size=(1920, 1080),
        virtual_fx=537,
        complete_3d_classes=[0],
    )

    sample = dataset.get_image_and_label(0)
    assert sample["im_file"] == str(image_file.resolve())
    assert sample["calib"]["fx"] > 0


def test_ground3d_dataset_applies_class_filter_and_rect_lazily(tmp_path):
    """With ``rect=True`` and ``classes=[1]``, batch shapes exist and only the ``truck`` instance is returned."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.png"
    _write_sample(
        gt_root / rel_label,
        image_file,
        "car 0.5 0.5 0.25 0.25 0\ntruck 0.4 0.4 0.2 0.2 0\n",
        (64, 32),
    )
    write_clip_level_camera4(gt_root / "calib" / "seq0", (64, 32))
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root, class_map={"car": 0, "truck": 1}, rect=True, classes=[1])

    assert dataset.batch.shape == (1,)
    assert dataset.batch_shapes.shape == (1, 2)
    sample = dataset.get_image_and_label(0)
    assert sample["cls"].reshape(-1).tolist() == [1.0]


def test_ground3d_dataset_keeps_missing_3d_targets_as_nan_for_2d_only_labels(tmp_path):
    """A label without 3D fields yields an all-NaN ``(1, 42)`` ``labels_3d``, both raw and after ``dataset[0]``."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.png"
    _write_sample(gt_root / rel_label, image_file, "car 0.5 0.5 0.25 0.25 1 0\n", (64, 32))
    write_clip_level_camera4(gt_root / "calib" / "seq0", (64, 32))
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root)

    raw_sample = dataset.get_image_and_label(0)
    assert raw_sample["labels_3d"].shape == (1, 42)
    assert np.isnan(raw_sample["labels_3d"]).all()

    sample = dataset[0]
    assert sample["labels_3d"].shape == (1, 42)
    assert sample["labels_3d"].isnan().all().item()


def test_ground3d_dataset_falls_back_to_jpg_when_png_is_missing(tmp_path):
    """When only a ``.jpg`` image exists for a label, that file is the one loaded."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.jpg"
    _write_sample(gt_root / rel_label, image_file, "car 0.5 0.5 0.25 0.25 0\n", (64, 32))
    write_clip_level_camera4(gt_root / "calib" / "seq0", (64, 32))
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root)

    sample = dataset.get_image_and_label(0)
    assert sample["im_file"] == str(image_file.resolve())
    assert sample["img"].shape[:2] == (32, 64)


def test_ground3d_dataset_skips_to_next_image_when_imread_fails(tmp_path, monkeypatch):
    """If ``cv2.imread`` returns ``None`` for index 0, ``dataset[0]`` serves the next image and flags index 0."""
    dataset, image_files = create_ground3d_dataset(tmp_path, [(64, 32), (64, 32)])
    original_imread = cv2.imread

    def fake_imread(path, *args, **kwargs):
        # Accept and forward any flags so the stub works however the dataset invokes cv2.imread.
        if str(path) == image_files[0]:
            return None
        return original_imread(path, *args, **kwargs)

    monkeypatch.setattr(cv2, "imread", fake_imread)

    sample = dataset[0]

    assert sample["im_file"] == image_files[1]
    assert dataset._bad_image_mask[0]


def test_ground3d_dataset_allows_missing_calibration_for_2d_only_samples(tmp_path):
    """A 2D-only sample whose calibration file was deleted still loads, in ``roi`` camera mode."""
    dataset, image_files = create_ground3d_dataset(
        tmp_path,
        [(128, 64), (128, 64)],
        imgsz=(64, 32),
        roi=(64, 32),
        ori_img_size=(128, 64),
    )
    first_calib = tmp_path / "gt" / "calib" / "seq1" / "L2_calib" / "camera4.json"
    first_calib.unlink()

    sample = dataset.get_image_and_label(0)

    assert sample["im_file"] == image_files[0]
    assert sample["ori_shape"] == (32, 64)
    assert sample["img"].shape[:2] == (32, 64)
    assert sample["camera_mode"] == "roi"


def test_ground3d_dataset_fails_on_missing_calibration_for_3d_samples(tmp_path):
    """A sample of a ``complete_3d_classes`` class without calibration raises ``Ground3DCalibrationError``."""
    gt_root = tmp_path / "gt"
    image_root = tmp_path / "dataset"
    rel_label = Path("labels/seq0/frame_0001.txt")
    image_file = image_root / "images" / "seq0" / "frame_0001.png"
    clip_calib_file = gt_root / "calib" / "seq0" / "L2_calib" / "camera4.json"
    # 19-col complete_3d label: class + 18 numeric fields.
    _write_sample(
        gt_root / rel_label,
        image_file,
        "car 0.5 0.5 0.25 0.25 1 2 3 4 5 6 0.1 0.2 0.3 9 10 11 12 0\n",
        (64, 32),
    )
    # The calibration directory exists, but camera4.json itself is deliberately absent.
    clip_calib_file.parent.mkdir(parents=True, exist_ok=True)
    (gt_root / "train.txt").write_text(f"{rel_label.as_posix()}\n", encoding="utf-8")

    dataset = _build_dataset(gt_root, image_root, complete_3d_classes=[0])

    with pytest.raises(Ground3DCalibrationError, match="calibration file not found"):
        dataset[0]


def test_ground3d_dataset_rejects_missing_required_ground3d_fields(tmp_path):
    """Constructing the dataset without ``virtual_camera_prob`` and ``crop_center_mode`` raises ``ValueError``."""
    with pytest.raises(ValueError, match="virtual_camera_prob, crop_center_mode"):
        YOLOGround3DDataset(
            img_path=str(tmp_path / "unused.txt"),
            imgsz=[64, 32],
            batch_size=1,
            augment=False,
            rect=False,
            stride=32,
            pad=0.5,
            prefix="test: ",
            task="detect",
            data={
                "path": str(tmp_path / "dataset"),
                "class_map": {"car": 0},
                "roi": [64, 32],
                "ori_img_size": [64, 32],
                "virtual_fx": 50,
            },
        )


def test_ground3d_dataset_skips_images_with_invalid_decoded_shape(tmp_path):
    """A 32x16 image, where the configured ``ori_img_size`` is 64x32, is skipped and flagged as bad."""
    dataset, image_files = create_ground3d_dataset(tmp_path, [(32, 16), (64, 32)])

    sample = dataset[0]

    assert sample["im_file"] == image_files[1]
    assert dataset._bad_image_mask[0]


def test_ground3d_dataset_resizes_in_half_steps_for_quarter_scale(tmp_path, monkeypatch):
    """Downscaling 256x128 to 64x32 issues two halving ``cv2.resize`` calls and scales fx from 50 to 12.5."""
    dataset, _ = create_ground3d_dataset(
        tmp_path,
        [(256, 128)],
        imgsz=(64, 32),
        roi=(256, 128),
        ori_img_size=(256, 128),
    )
    original_resize = cv2.resize
    resize_calls = []

    def tracked_resize(img, dsize, *args, **kwargs):
        resize_calls.append(dsize)
        return original_resize(img, dsize, *args, **kwargs)

    monkeypatch.setattr(cv2, "resize", tracked_resize)

    sample = dataset.get_image_and_label(0)

    assert resize_calls == [(128, 64), (64, 32)]
    assert sample["img"].shape[:2] == (32, 64)
    assert sample["calib"]["fx"] == pytest.approx(12.5)


def test_ground3d_dataset_resizes_in_half_steps_then_remainder(tmp_path, monkeypatch):
    """Downscaling 160x80 to 64x32 halves once to 80x40, then resizes the remainder; fx goes from 50 to 20."""
    dataset, _ = create_ground3d_dataset(
        tmp_path,
        [(160, 80)],
        imgsz=(64, 32),
        roi=(160, 80),
        ori_img_size=(160, 80),
    )
    original_resize = cv2.resize
    resize_calls = []

    def tracked_resize(img, dsize, *args, **kwargs):
        resize_calls.append(dsize)
        return original_resize(img, dsize, *args, **kwargs)

    monkeypatch.setattr(cv2, "resize", tracked_resize)

    sample = dataset.get_image_and_label(0)

    assert resize_calls == [(80, 40), (64, 32)]
    assert sample["img"].shape[:2] == (32, 64)
    assert sample["calib"]["fx"] == pytest.approx(20.0)