Files
HSAP/datasets/lane.embedded.bak/scripts/build_ufld_dataset.py
Chengfang Lu e72bc061c5 feat: HSAP platform v2 — modular navigation, quality review, audit log, world model simulation
Major changes:
- New frontend (platform/web/): Vite + React 18 + TypeScript + Tailwind
- 4-module navigation: 数据送标 / 模型管理 / 车队管理 / 系统管理
- Data catalog with charts (DMS/ADAS/Lane 3-tab view)
- Quality review workflow (标注质检): Good/Fine/Bad scoring with auto-advance
- Audit enhancements: batch operations, rejection categories, Feishu notifications
- Operation audit log (操作日志)
- World model simulation studio (仿真工坊)
- Dataset version management with snapshots and diff
- ADAS 7-class dataset integration (138K images organized + compressed)
- User management with Feishu integration and pagination
- CRUD/search/filter on all pages, card layout redesign
- PIL-optimized image overlay rendering
- Auto-snapshot on build, in_review workflow stage
- Removed embedded algorithm code (now in workspace)
2026-06-03 11:40:21 +08:00

273 lines
9.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Build UFLD-ready dataset under lane0_copy/DATASET from archive train_2025_03_13_mufld.
Layout:
DATASET/
images/<src_...>/...frame_XXXXXX.jpg|png
annotations/segmentation_masks/<src_...>/...frame_XXXXXX.png
list/train_gt.txt # 90% train (two columns)
list/val_gt.txt # 10% val
list/test_gt.txt # held-out labeled test
list/test.txt # image-only inference list
manifest.json
README.md
Uses hardlinks when possible (same filesystem, no extra disk for file data).
Usage:
conda activate lane_light
python build_ufld_dataset.py
python build_ufld_dataset.py --copy # physical copy instead of hardlink
"""
from __future__ import annotations
import argparse
import json
import os
import random
import shutil
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
# reuse naming rules
SCRIPT_DIR = Path(__file__).resolve().parent
sys.path.insert(0, str(SCRIPT_DIR))
from rename_ufld_dataset import transform_dir_component, transform_filename # noqa: E402
# Default archive directory read by the script; overridable with --src.
DEFAULT_SRC = Path("/home/chengfanglu/DATA/lane0_copy/archive/train_2025_03_13_mufld")
# Default dataset root written by the script; overridable with --out.
DEFAULT_OUT = Path("/home/chengfanglu/DATA/lane0_copy/DATASET")
# Prefix (relative to the dataset root) prepended to every renamed image path.
IMG_ROOT = "images"
# Prefix (relative to the dataset root) prepended to every renamed mask path.
LBL_ROOT = "annotations/segmentation_masks"
def transform_core_rel(rel: str) -> str:
    """Legacy path (no seg_label prefix) -> renamed relative path.

    The path is normalised to forward slashes, leading separators and an
    optional ``seg_label/`` prefix are removed, and every component is then
    passed through the shared renaming rules: ``transform_dir_component`` for
    directories and ``transform_filename`` for the final component.

    Args:
        rel: Legacy relative path of an image or mask.

    Returns:
        The renamed relative path joined with ``/``. An empty path (after
        normalisation) is returned unchanged.
    """
    # Normalise separators *before* stripping so a leading backslash is
    # removed as well instead of surviving as an empty first component.
    rel = rel.replace("\\", "/").lstrip("/")
    if rel.startswith("seg_label/"):
        rel = rel[len("seg_label/"):]
    if not rel:
        # str.split() never returns an empty list, so the emptiness check has
        # to be made on the string itself.
        return rel

    *dir_parts, last = rel.split("/")
    if not dir_parts:
        # NOTE(review): a single-component path is renamed with the directory
        # rule, exactly as before — confirm this is intended for bare filenames.
        return transform_dir_component(last)

    renamed = [transform_dir_component(part) for part in dir_parts]
    renamed.append(transform_filename(last))
    return "/".join(renamed)
def to_image_rel(legacy_img: str) -> str:
    """Return the renamed image path, relative to the dataset root."""
    renamed = transform_core_rel(legacy_img)
    return "/".join((IMG_ROOT, renamed))
def to_mask_rel(legacy_mask: str) -> str:
    """Return the renamed mask path, relative to the dataset root."""
    renamed = transform_core_rel(legacy_mask)
    return "/".join((LBL_ROOT, renamed))
def parse_gt_line(line: str) -> tuple[str, str] | None:
    """Split one list line into its (image, mask) relative paths.

    Only the first two whitespace-separated columns are used; leading
    slashes are removed from both. Lines with fewer than two columns
    yield ``None``.
    """
    columns = line.split()
    if len(columns) >= 2:
        image_rel, mask_rel = (column.lstrip("/") for column in columns[:2])
        return image_rel, mask_rel
    return None
def link_or_copy(src: Path, dst: Path, use_copy: bool) -> None:
    """Materialise ``src`` at ``dst`` as a hardlink or a physical copy.

    Parent directories of ``dst`` are created as needed. The call is
    idempotent: an existing ``dst`` that is the same file as ``src`` (hardlink)
    or has byte-identical content (earlier physical copy) is left untouched,
    so the build can be re-run in either mode.

    Args:
        src: Existing source file.
        dst: Destination path inside the dataset.
        use_copy: When true always copy; otherwise try a hardlink first and
            fall back to a copy if linking fails (e.g. across filesystems).

    Raises:
        FileExistsError: ``dst`` already exists with different content.
    """
    import filecmp  # local import: only needed when the destination exists

    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        # A previous copy is never `samefile`, so also accept equal content;
        # filecmp short-circuits on size before reading any bytes.
        if dst.samefile(src) or filecmp.cmp(src, dst, shallow=False):
            return
        raise FileExistsError(f"exists with different file: {dst}")

    if use_copy:
        shutil.copy2(src, dst)
        return
    try:
        os.link(src, dst)
    except OSError:
        # Best effort: hardlinks are unavailable here, keep going with a copy.
        shutil.copy2(src, dst)
def _read_gt_pairs(path: Path) -> list[tuple[str, str]]:
    """Return the (image, mask) pair of every parseable line in a list file."""
    text = path.read_text(encoding="utf-8", errors="replace")
    return [pair for pair in map(parse_gt_line, text.splitlines()) if pair]


def _read_image_list(path: Path) -> list[str]:
    """Return the non-empty entries of an image-only list, leading slashes removed."""
    text = path.read_text(encoding="utf-8", errors="replace")
    entries = (raw.strip().lstrip("/") for raw in text.splitlines())
    return [entry for entry in entries if entry]


def _materialize(
    jobs: dict[str, str],
    src_root: Path,
    out_root: Path,
    use_copy: bool,
    kind: str,
    missing: list[tuple[str, str]],
) -> int:
    """Link/copy every job, record absent sources in ``missing``, return the count done."""
    done = 0
    total = len(jobs)
    for idx, (legacy, new_rel) in enumerate(jobs.items(), start=1):
        src_file = src_root / legacy
        if not src_file.is_file():
            missing.append((kind, legacy))
            continue
        link_or_copy(src_file, out_root / new_rel, use_copy)
        done += 1
        if idx % 20000 == 0:
            print(f" {kind}s {idx}/{total}", file=sys.stderr)
    return done


def _split_train_val(
    pairs: list[tuple[str, str]], val_ratio: float, seed: int
) -> tuple[list[str], list[str]]:
    """Split pairs into shuffled train/val list lines, stratified by top-level source dir."""
    by_src: dict[str, list[tuple[str, str]]] = defaultdict(list)
    for img, msk in pairs:
        by_src[img.split("/")[0]].append((to_image_rel(img), to_mask_rel(msk)))

    rng = random.Random(seed)
    train_lines: list[str] = []
    val_lines: list[str] = []
    for src_name in sorted(by_src):
        items = by_src[src_name]
        rng.shuffle(items)
        # Sources with at least 10 pairs always contribute one validation pair.
        floor = 1 if len(items) >= 10 else 0
        n_val = max(floor, int(len(items) * val_ratio))
        val_lines.extend(f"{ir} {mr}" for ir, mr in items[:n_val])
        train_lines.extend(f"{ir} {mr}" for ir, mr in items[n_val:])
    rng.shuffle(train_lines)
    rng.shuffle(val_lines)
    return train_lines, val_lines


def _write_list(path: Path, lines: list[str]) -> None:
    """Write one entry per line; an empty list gives an empty file, not a blank line."""
    path.write_text("".join(f"{line}\n" for line in lines), encoding="utf-8")


def main() -> None:
    """Build the UFLD dataset layout from the archive list files.

    Steps:
      1. Parse ``train_val_gt.txt``, ``test_gt.txt`` and ``test.txt`` under ``--src``.
      2. Hardlink (or copy with ``--copy``) every referenced image and mask
         into ``--out`` under its renamed path.
      3. Split the train/val pairs per source directory using ``--val-ratio``
         and ``--seed``, then write the four list files.
      4. Write ``manifest.json`` and ``README.md`` and print the manifest.

    Exits with a message if the source directory or one of its list files is
    missing, and with status 1 if any referenced image/mask was not found
    (the list files still reference those entries; see the manifest).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", type=Path, default=DEFAULT_SRC)
    ap.add_argument("--out", type=Path, default=DEFAULT_OUT)
    ap.add_argument("--copy", action="store_true", help="Physical copy (uses ~2x disk)")
    ap.add_argument("--val-ratio", type=float, default=0.1)
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()
    if not 0.0 <= args.val_ratio <= 1.0:
        ap.error("--val-ratio must be within [0, 1]")

    src_root = args.src.resolve()
    out_root = args.out.resolve()
    use_copy = args.copy
    if not src_root.is_dir():
        sys.exit(f"Source not found: {src_root}")

    # --- collect pairs from manifests ---
    train_val_path = src_root / "train_val_gt.txt"
    test_gt_path = src_root / "test_gt.txt"
    test_txt_path = src_root / "test.txt"
    # Fail with a readable message before touching the output directory.
    absent = [
        str(p) for p in (train_val_path, test_gt_path, test_txt_path) if not p.is_file()
    ]
    if absent:
        sys.exit("Source list file(s) not found: " + ", ".join(absent))

    out_root.mkdir(parents=True, exist_ok=True)
    list_dir = out_root / "list"
    list_dir.mkdir(parents=True, exist_ok=True)

    pairs = _read_gt_pairs(train_val_path)
    test_pairs = _read_gt_pairs(test_gt_path)
    test_images_only = _read_image_list(test_txt_path)

    # unique files to materialize (legacy path -> new relative path)
    img_jobs: dict[str, str] = {}
    msk_jobs: dict[str, str] = {}
    for img, msk in pairs + test_pairs:
        img_jobs[img] = to_image_rel(img)
        msk_jobs[msk] = to_mask_rel(msk)
    for img in test_images_only:
        img_jobs[img] = to_image_rel(img)

    print(f"Link/copy {len(img_jobs)} images + {len(msk_jobs)} masks -> {out_root}", file=sys.stderr)
    missing: list[tuple[str, str]] = []
    linked_img = _materialize(img_jobs, src_root, out_root, use_copy, "image", missing)
    linked_msk = _materialize(msk_jobs, src_root, out_root, use_copy, "mask", missing)

    # --- train / val split (stratified by source) ---
    train_lines, val_lines = _split_train_val(pairs, args.val_ratio, args.seed)
    _write_list(list_dir / "train_gt.txt", train_lines)
    _write_list(list_dir / "val_gt.txt", val_lines)

    test_gt_lines = [f"{to_image_rel(i)} {to_mask_rel(m)}" for i, m in test_pairs]
    _write_list(list_dir / "test_gt.txt", test_gt_lines)

    test_inf_lines = [to_image_rel(i) for i in test_images_only]
    _write_list(list_dir / "test.txt", test_inf_lines)

    manifest = {
        "created_utc": datetime.now(timezone.utc).isoformat(),
        "source": str(src_root),
        "output": str(out_root),
        "link_mode": "copy" if use_copy else "hardlink",
        "train_pairs": len(train_lines),
        "val_pairs": len(val_lines),
        "test_gt_pairs": len(test_gt_lines),
        "test_inference_images": len(test_inf_lines),
        "linked_images": linked_img,
        "linked_masks": linked_msk,
        "missing_files": missing[:50],  # capped so the manifest stays small
        "missing_count": len(missing),
        "val_ratio": args.val_ratio,
        "seed": args.seed,
        "ufld_data_root": str(out_root),
        "ufld_train_list": "list/train_gt.txt",
    }
    (out_root / "manifest.json").write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )

    # Only the hardlink mode shares inodes with the archive, so only it may
    # claim to save disk space.
    storage_note = (
        "文件通过 **物理复制** 生成,占用额外磁盘。"
        if use_copy
        else "文件通过 **硬链接** 生成,节省磁盘(硬链接与 archive 共享 inode)。"
    )
    readme = f"""# lane0_copy/DATASET — UFLD 训练包
生成自: `{src_root}`
## 目录结构
```
DATASET/
├── images/ # 原图(清晰命名)
├── annotations/segmentation_masks/ # 分割标签(与 images 镜像路径)
├── list/
│ ├── train_gt.txt # 训练({len(train_lines)} 对)
│ ├── val_gt.txt # 验证({len(val_lines)} 对)
│ ├── test_gt.txt # 有标签测试({len(test_gt_lines)} 对)
│ └── test.txt # 仅图像推理({len(test_inf_lines)} 条)
├── manifest.json
└── README.md
```
## 命名规则
- 来源目录: `src_<类型>_<设备>_<日期>`,例如 `src_cam_zxc_20250628`
- 子目录: `clip_XX` / `scene_XX` / `unit_XX` / `video_*` 等
- 帧文件: `frame_XXXXXX.jpg` / `frame_cam_<id>.jpg`(去掉 `_new` 后缀)
## UFLD 训练
```bash
cd /home/chengfanglu/DATA/BK2/UFLD
# configs/mufld_lane_culane.py 中 data_root 指向本目录
python train.py configs/mufld_lane_culane.py
```
`LaneClsDataset` 读取 `list/train_gt.txt`(两列:图像相对路径、mask 相对路径)。
## 说明
- {storage_note}
- 有标签评测用 `list/test_gt.txt`,勿与 `list/test.txt` 混用。
"""
    (out_root / "README.md").write_text(readme, encoding="utf-8")

    print(json.dumps(manifest, indent=2, ensure_ascii=False))
    if missing:
        print(f"WARNING: {len(missing)} missing files (see manifest)", file=sys.stderr)
        sys.exit(1)
# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()