#!/usr/bin/env python3 """ Build UFLD-ready dataset under lane0_copy/DATASET from archive train_2025_03_13_mufld. Layout: DATASET/ images//...frame_XXXXXX.jpg|png annotations/segmentation_masks//...frame_XXXXXX.png list/train_gt.txt # 90% train (two columns) list/val_gt.txt # 10% val list/test_gt.txt # held-out labeled test list/test.txt # image-only inference list manifest.json README.md Uses hardlinks when possible (same filesystem, no extra disk for file data). Usage: conda activate lane_light python build_ufld_dataset.py python build_ufld_dataset.py --copy # physical copy instead of hardlink """ from __future__ import annotations import argparse import json import os import random import shutil import sys from collections import defaultdict from datetime import datetime, timezone from pathlib import Path # reuse naming rules SCRIPT_DIR = Path(__file__).resolve().parent sys.path.insert(0, str(SCRIPT_DIR)) from rename_ufld_dataset import transform_dir_component, transform_filename # noqa: E402 DEFAULT_SRC = Path("/home/chengfanglu/DATA/lane0_copy/archive/train_2025_03_13_mufld") DEFAULT_OUT = Path("/home/chengfanglu/DATA/lane0_copy/DATASET") IMG_ROOT = "images" LBL_ROOT = "annotations/segmentation_masks" def transform_core_rel(rel: str) -> str: """Legacy path (no seg_label prefix) -> renamed relative path.""" rel = rel.lstrip("/").replace("\\", "/") if rel.startswith("seg_label/"): rel = rel[len("seg_label/") :] parts = rel.split("/") if not parts: return rel out = [transform_dir_component(parts[0])] for i in range(1, len(parts)): comp = parts[i] out.append( transform_filename(comp) if i == len(parts) - 1 else transform_dir_component(comp) ) return "/".join(out) def to_image_rel(legacy_img: str) -> str: return f"{IMG_ROOT}/{transform_core_rel(legacy_img)}" def to_mask_rel(legacy_mask: str) -> str: return f"{LBL_ROOT}/{transform_core_rel(legacy_mask)}" def parse_gt_line(line: str) -> tuple[str, str] | None: parts = line.strip().split() if len(parts) < 2: return None return parts[0].lstrip("/"), parts[1].lstrip("/") def link_or_copy(src: Path, dst: Path, use_copy: bool) -> None: dst.parent.mkdir(parents=True, exist_ok=True) if dst.exists(): if dst.samefile(src): return raise FileExistsError(f"exists with different file: {dst}") if use_copy: shutil.copy2(src, dst) else: try: os.link(src, dst) except OSError: shutil.copy2(src, dst) def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--src", type=Path, default=DEFAULT_SRC) ap.add_argument("--out", type=Path, default=DEFAULT_OUT) ap.add_argument("--copy", action="store_true", help="Physical copy (uses ~2x disk)") ap.add_argument("--val-ratio", type=float, default=0.1) ap.add_argument("--seed", type=int, default=42) args = ap.parse_args() src_root = args.src.resolve() out_root = args.out.resolve() use_copy = args.copy if not src_root.is_dir(): sys.exit(f"Source not found: {src_root}") out_root.mkdir(parents=True, exist_ok=True) list_dir = out_root / "list" list_dir.mkdir(parents=True, exist_ok=True) # --- collect pairs from manifests --- train_val_path = src_root / "train_val_gt.txt" test_gt_path = src_root / "test_gt.txt" test_txt_path = src_root / "test.txt" pairs: list[tuple[str, str]] = [] for line in train_val_path.read_text(encoding="utf-8", errors="replace").splitlines(): p = parse_gt_line(line) if p: pairs.append(p) test_pairs: list[tuple[str, str]] = [] for line in test_gt_path.read_text(encoding="utf-8", errors="replace").splitlines(): p = parse_gt_line(line) if p: test_pairs.append(p) test_images_only: list[str] = [] for line in test_txt_path.read_text(encoding="utf-8", errors="replace").splitlines(): p = line.strip().lstrip("/") if p: test_images_only.append(p) # unique files to materialize img_jobs: dict[str, str] = {} # legacy -> new rel msk_jobs: dict[str, str] = {} for img, msk in pairs + test_pairs: img_jobs[img] = to_image_rel(img) msk_jobs[msk] = to_mask_rel(msk) for img in test_images_only: img_jobs[img] = to_image_rel(img) print(f"Link/copy {len(img_jobs)} images + {len(msk_jobs)} masks -> {out_root}", file=sys.stderr) missing = [] linked_img = linked_msk = 0 for i, (legacy, new_rel) in enumerate(img_jobs.items()): s, d = src_root / legacy, out_root / new_rel if not s.is_file(): missing.append(("image", legacy)) continue link_or_copy(s, d, use_copy) linked_img += 1 if (i + 1) % 20000 == 0: print(f" images {i+1}/{len(img_jobs)}", file=sys.stderr) for i, (legacy, new_rel) in enumerate(msk_jobs.items()): s, d = src_root / legacy, out_root / new_rel if not s.is_file(): missing.append(("mask", legacy)) continue link_or_copy(s, d, use_copy) linked_msk += 1 if (i + 1) % 20000 == 0: print(f" masks {i+1}/{len(msk_jobs)}", file=sys.stderr) # --- train / val split (stratified by source) --- by_src: dict[str, list[tuple[str, str]]] = defaultdict(list) for img, msk in pairs: by_src[img.split("/")[0]].append((to_image_rel(img), to_mask_rel(msk))) rng = random.Random(args.seed) train_lines: list[str] = [] val_lines: list[str] = [] for src_name in sorted(by_src.keys()): items = by_src[src_name] rng.shuffle(items) n_val = max(1, int(len(items) * args.val_ratio)) if len(items) >= 10 else max(0, int(len(items) * args.val_ratio)) val_items = items[:n_val] tr_items = items[n_val:] for ir, mr in tr_items: train_lines.append(f"{ir} {mr}") for ir, mr in val_items: val_lines.append(f"{ir} {mr}") rng.shuffle(train_lines) rng.shuffle(val_lines) (list_dir / "train_gt.txt").write_text("\n".join(train_lines) + "\n", encoding="utf-8") (list_dir / "val_gt.txt").write_text("\n".join(val_lines) + "\n", encoding="utf-8") test_gt_lines = [f"{to_image_rel(i)} {to_mask_rel(m)}" for i, m in test_pairs] (list_dir / "test_gt.txt").write_text("\n".join(test_gt_lines) + "\n", encoding="utf-8") test_inf_lines = [to_image_rel(i) for i in test_images_only] (list_dir / "test.txt").write_text("\n".join(test_inf_lines) + "\n", encoding="utf-8") manifest = { "created_utc": datetime.now(timezone.utc).isoformat(), "source": str(src_root), "output": str(out_root), "link_mode": "copy" if use_copy else "hardlink", "train_pairs": len(train_lines), "val_pairs": len(val_lines), "test_gt_pairs": len(test_gt_lines), "test_inference_images": len(test_inf_lines), "linked_images": linked_img, "linked_masks": linked_msk, "missing_files": missing[:50], "missing_count": len(missing), "val_ratio": args.val_ratio, "seed": args.seed, "ufld_data_root": str(out_root), "ufld_train_list": "list/train_gt.txt", } (out_root / "manifest.json").write_text( json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8" ) readme = f"""# lane0_copy/DATASET — UFLD 训练包 生成自: `{src_root}` ## 目录结构 ``` DATASET/ ├── images/ # 原图(清晰命名) ├── annotations/segmentation_masks/ # 分割标签(与 images 镜像路径) ├── list/ │ ├── train_gt.txt # 训练({len(train_lines)} 对) │ ├── val_gt.txt # 验证({len(val_lines)} 对) │ ├── test_gt.txt # 有标签测试({len(test_gt_lines)} 对) │ └── test.txt # 仅图像推理({len(test_inf_lines)} 条) ├── manifest.json └── README.md ``` ## 命名规则 - 来源目录: `src_<类型>_<设备>_<日期>`,例如 `src_cam_zxc_20250628` - 子目录: `clip_XX` / `scene_XX` / `unit_XX` / `video_*` 等 - 帧文件: `frame_XXXXXX.jpg` / `frame_cam_.jpg`(去掉 `_new` 后缀) ## UFLD 训练 ```bash cd /home/chengfanglu/DATA/BK2/UFLD # configs/mufld_lane_culane.py 中 data_root 指向本目录 python train.py configs/mufld_lane_culane.py ``` `LaneClsDataset` 读取 `list/train_gt.txt`(两列:图像相对路径、mask 相对路径)。 ## 说明 - 文件通过 **{'物理复制' if use_copy else '硬链接'}** 生成,节省磁盘(硬链接与 archive 共享 inode)。 - 有标签评测用 `list/test_gt.txt`,勿与 `list/test.txt` 混用。 """ (out_root / "README.md").write_text(readme, encoding="utf-8") print(json.dumps(manifest, indent=2, ensure_ascii=False)) if missing: print(f"WARNING: {len(missing)} missing files (see manifest)", file=sys.stderr) sys.exit(1) if __name__ == "__main__": main()