feat: HSAP platform v2 — modular navigation, quality review, audit log, world model simulation
Major changes: - New frontend (platform/web/): Vite + React 18 + TypeScript + Tailwind - 4-module navigation: 数据送标 / 模型管理 / 车队管理 / 系统管理 - Data catalog with charts (DMS/ADAS/Lane 3-tab view) - Quality review workflow (标注质检): Good/Fine/Bad scoring with auto-advance - Audit enhancements: batch operations, rejection categories, Feishu notifications - Operation audit log (操作日志) - World model simulation studio (仿真工坊) - Dataset version management with snapshots and diff - ADAS 7-class dataset integration (138K images organized + compressed) - User management with Feishu integration and pagination - CRUD/search/filter on all pages, card layout redesign - PIL-optimized image overlay rendering - Auto-snapshot on build, in_review workflow stage - Removed embedded algorithm code (now in workspace)
This commit is contained in:
272
datasets/lane.embedded.bak/scripts/build_ufld_dataset.py
Normal file
272
datasets/lane.embedded.bak/scripts/build_ufld_dataset.py
Normal file
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build UFLD-ready dataset under lane0_copy/DATASET from archive train_2025_03_13_mufld.
|
||||
|
||||
Layout:
|
||||
DATASET/
|
||||
images/<src_...>/...frame_XXXXXX.jpg|png
|
||||
annotations/segmentation_masks/<src_...>/...frame_XXXXXX.png
|
||||
list/train_gt.txt # 90% train (two columns)
|
||||
list/val_gt.txt # 10% val
|
||||
list/test_gt.txt # held-out labeled test
|
||||
list/test.txt # image-only inference list
|
||||
manifest.json
|
||||
README.md
|
||||
|
||||
Uses hardlinks when possible (same filesystem, no extra disk for file data).
|
||||
|
||||
Usage:
|
||||
conda activate lane_light
|
||||
python build_ufld_dataset.py
|
||||
python build_ufld_dataset.py --copy # physical copy instead of hardlink
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# reuse naming rules
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
from rename_ufld_dataset import transform_dir_component, transform_filename # noqa: E402
|
||||
|
||||
# Default archive to read and default dataset directory to write;
# both can be overridden on the command line with --src / --out.
DEFAULT_SRC = Path("/home/chengfanglu/DATA/lane0_copy/archive/train_2025_03_13_mufld")
DEFAULT_OUT = Path("/home/chengfanglu/DATA/lane0_copy/DATASET")

# Sub-directories of the output root that receive images and masks.
IMG_ROOT = "images"
LBL_ROOT = "annotations/segmentation_masks"
|
||||
|
||||
|
||||
def transform_core_rel(rel: str) -> str:
    """Map a legacy archive-relative path to its renamed relative path.

    Leading slashes are dropped, backslashes become forward slashes and a
    leading ``seg_label/`` prefix is removed.  The first component and every
    intermediate component go through ``transform_dir_component``; the last
    component of a multi-part path goes through ``transform_filename``.
    """
    rel = rel.lstrip("/").replace("\\", "/")
    legacy_prefix = "seg_label/"
    if rel.startswith(legacy_prefix):
        rel = rel[len(legacy_prefix):]

    # str.split always yields at least one element, so `head` always exists.
    head, *rest = rel.split("/")
    renamed = [transform_dir_component(head)]
    if rest:
        renamed.extend(transform_dir_component(part) for part in rest[:-1])
        renamed.append(transform_filename(rest[-1]))
    return "/".join(renamed)
|
||||
|
||||
|
||||
def to_image_rel(legacy_img: str) -> str:
    """Return the output-relative image path for a legacy image path."""
    renamed = transform_core_rel(legacy_img)
    return IMG_ROOT + "/" + renamed
|
||||
|
||||
|
||||
def to_mask_rel(legacy_mask: str) -> str:
    """Return the output-relative mask path for a legacy mask path."""
    renamed = transform_core_rel(legacy_mask)
    return LBL_ROOT + "/" + renamed
|
||||
|
||||
|
||||
def parse_gt_line(line: str) -> tuple[str, str] | None:
    """Parse one ground-truth list line into ``(image, mask)``.

    The line is split on whitespace; the first two fields are returned with
    leading slashes removed.  Lines with fewer than two fields yield ``None``.
    Any further fields are ignored.
    """
    fields = line.split()
    if len(fields) >= 2:
        image, mask = fields[0], fields[1]
        return image.lstrip("/"), mask.lstrip("/")
    return None
|
||||
|
||||
|
||||
def link_or_copy(src: Path, dst: Path, use_copy: bool) -> None:
    """Materialize ``src`` at ``dst`` via hardlink, or via a physical copy.

    Parent directories of ``dst`` are created as needed.  If ``dst`` already
    exists and is the same file as ``src`` nothing happens; if it exists but
    is a different file, ``FileExistsError`` is raised.  When ``use_copy`` is
    false a hardlink is attempted first and any ``OSError`` from it falls
    back to ``shutil.copy2``.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)

    if dst.exists():
        if not dst.samefile(src):
            raise FileExistsError(f"exists with different file: {dst}")
        return

    hardlinked = False
    if not use_copy:
        try:
            os.link(src, dst)
        except OSError:
            # Hardlink not possible here; fall through to a copy.
            hardlinked = False
        else:
            hardlinked = True

    if not hardlinked:
        shutil.copy2(src, dst)
|
||||
|
||||
|
||||
def _read_gt_pairs(path: Path, *, required: bool) -> list[tuple[str, str]]:
    """Read ``(image, mask)`` pairs from a two-column list file.

    A missing required file ends the run with a clear message; a missing
    optional file is reported on stderr and treated as empty.
    """
    if not path.is_file():
        if required:
            sys.exit(f"List file not found: {path}")
        print(f"NOTE: optional list not found, treating as empty: {path}", file=sys.stderr)
        return []
    text = path.read_text(encoding="utf-8", errors="replace")
    return [pair for pair in map(parse_gt_line, text.splitlines()) if pair]


def _read_image_list(path: Path) -> list[str]:
    """Read an image-only list (one path per line); a missing file is empty."""
    if not path.is_file():
        print(f"NOTE: optional list not found, treating as empty: {path}", file=sys.stderr)
        return []
    text = path.read_text(encoding="utf-8", errors="replace")
    stripped = (ln.strip().lstrip("/") for ln in text.splitlines())
    return [ln for ln in stripped if ln]


def _materialize(
    jobs: dict[str, str],
    kind: str,
    src_root: Path,
    out_root: Path,
    use_copy: bool,
    missing: list[tuple[str, str]],
) -> int:
    """Link/copy every ``legacy -> new`` job; record absent sources in ``missing``.

    Returns the number of files actually linked or copied.
    """
    linked = 0
    total = len(jobs)
    for done, (legacy, new_rel) in enumerate(jobs.items(), start=1):
        source = src_root / legacy
        if not source.is_file():
            missing.append((kind, legacy))
            continue
        link_or_copy(source, out_root / new_rel, use_copy)
        linked += 1
        if done % 20000 == 0:
            print(f" {kind}s {done}/{total}", file=sys.stderr)
    return linked


def _split_train_val(
    pairs: list[tuple[str, str]], val_ratio: float, seed: int
) -> tuple[list[str], list[str]]:
    """Split pairs into train/val list lines, stratified by top-level source dir.

    Sources with at least 10 pairs always contribute at least one val sample;
    smaller sources contribute ``int(len * val_ratio)``.
    """
    by_src: dict[str, list[str]] = defaultdict(list)
    for img, msk in pairs:
        by_src[img.split("/")[0]].append(f"{to_image_rel(img)} {to_mask_rel(msk)}")

    rng = random.Random(seed)
    train_lines: list[str] = []
    val_lines: list[str] = []
    # Sorted iteration keeps the split reproducible for a given seed.
    for src_name in sorted(by_src):
        items = by_src[src_name]
        rng.shuffle(items)
        n_val = int(len(items) * val_ratio)
        if len(items) >= 10:
            n_val = max(1, n_val)
        val_lines.extend(items[:n_val])
        train_lines.extend(items[n_val:])

    rng.shuffle(train_lines)
    rng.shuffle(val_lines)
    return train_lines, val_lines


def _write_list(path: Path, lines: list[str]) -> None:
    """Write one entry per line; an empty list yields an empty file."""
    path.write_text("".join(f"{ln}\n" for ln in lines), encoding="utf-8")


def _render_readme(
    src_root: Path, use_copy: bool, n_train: int, n_val: int, n_test_gt: int, n_test_inf: int
) -> str:
    """Render the README.md written next to the dataset."""
    return f"""# lane0_copy/DATASET — UFLD 训练包

生成自: `{src_root}`

## 目录结构

```
DATASET/
├── images/ # 原图(清晰命名)
├── annotations/segmentation_masks/ # 分割标签(与 images 镜像路径)
├── list/
│ ├── train_gt.txt # 训练({n_train} 对)
│ ├── val_gt.txt # 验证({n_val} 对)
│ ├── test_gt.txt # 有标签测试({n_test_gt} 对)
│ └── test.txt # 仅图像推理({n_test_inf} 条)
├── manifest.json
└── README.md
```

## 命名规则

- 来源目录: `src_<类型>_<设备>_<日期>`,例如 `src_cam_zxc_20250628`
- 子目录: `clip_XX` / `scene_XX` / `unit_XX` / `video_*` 等
- 帧文件: `frame_XXXXXX.jpg` / `frame_cam_<id>.jpg`(去掉 `_new` 后缀)

## UFLD 训练

```bash
cd /home/chengfanglu/DATA/BK2/UFLD
# configs/mufld_lane_culane.py 中 data_root 指向本目录
python train.py configs/mufld_lane_culane.py
```

`LaneClsDataset` 读取 `list/train_gt.txt`(两列:图像相对路径、mask 相对路径)。

## 说明

- 文件通过 **{'物理复制' if use_copy else '硬链接'}** 生成,节省磁盘(硬链接与 archive 共享 inode)。
- 有标签评测用 `list/test_gt.txt`,勿与 `list/test.txt` 混用。
"""


def main() -> None:
    """Build the UFLD dataset layout from an archive.

    Reads ``train_val_gt.txt`` (required) plus ``test_gt.txt`` and
    ``test.txt`` (optional) from ``--src``, links or copies every referenced
    image and mask into ``--out`` under renamed paths, writes the four list
    files, ``manifest.json`` and ``README.md``, and prints the manifest.

    Exits with status 1 when any referenced source file is missing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", type=Path, default=DEFAULT_SRC)
    ap.add_argument("--out", type=Path, default=DEFAULT_OUT)
    ap.add_argument("--copy", action="store_true", help="Physical copy (uses ~2x disk)")
    ap.add_argument("--val-ratio", type=float, default=0.1)
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    if not 0.0 <= args.val_ratio <= 1.0:
        ap.error(f"--val-ratio must be within [0, 1], got {args.val_ratio}")

    src_root = args.src.resolve()
    out_root = args.out.resolve()
    use_copy = args.copy

    if not src_root.is_dir():
        sys.exit(f"Source not found: {src_root}")

    # --- collect pairs from manifests (before creating any output) ---
    pairs = _read_gt_pairs(src_root / "train_val_gt.txt", required=True)
    test_pairs = _read_gt_pairs(src_root / "test_gt.txt", required=False)
    test_images_only = _read_image_list(src_root / "test.txt")

    out_root.mkdir(parents=True, exist_ok=True)
    list_dir = out_root / "list"
    list_dir.mkdir(parents=True, exist_ok=True)

    # Unique files to materialize: legacy relative path -> new relative path.
    img_jobs: dict[str, str] = {}
    msk_jobs: dict[str, str] = {}
    for img, msk in pairs + test_pairs:
        img_jobs[img] = to_image_rel(img)
        msk_jobs[msk] = to_mask_rel(msk)
    for img in test_images_only:
        img_jobs[img] = to_image_rel(img)

    print(f"Link/copy {len(img_jobs)} images + {len(msk_jobs)} masks -> {out_root}", file=sys.stderr)

    missing: list[tuple[str, str]] = []
    linked_img = _materialize(img_jobs, "image", src_root, out_root, use_copy, missing)
    linked_msk = _materialize(msk_jobs, "mask", src_root, out_root, use_copy, missing)

    # --- train / val split (stratified by source) ---
    train_lines, val_lines = _split_train_val(pairs, args.val_ratio, args.seed)
    test_gt_lines = [f"{to_image_rel(i)} {to_mask_rel(m)}" for i, m in test_pairs]
    test_inf_lines = [to_image_rel(i) for i in test_images_only]

    _write_list(list_dir / "train_gt.txt", train_lines)
    _write_list(list_dir / "val_gt.txt", val_lines)
    _write_list(list_dir / "test_gt.txt", test_gt_lines)
    _write_list(list_dir / "test.txt", test_inf_lines)

    manifest = {
        "created_utc": datetime.now(timezone.utc).isoformat(),
        "source": str(src_root),
        "output": str(out_root),
        "link_mode": "copy" if use_copy else "hardlink",
        "train_pairs": len(train_lines),
        "val_pairs": len(val_lines),
        "test_gt_pairs": len(test_gt_lines),
        "test_inference_images": len(test_inf_lines),
        "linked_images": linked_img,
        "linked_masks": linked_msk,
        # Only a sample is stored so the manifest stays small.
        "missing_files": missing[:50],
        "missing_count": len(missing),
        "val_ratio": args.val_ratio,
        "seed": args.seed,
        "ufld_data_root": str(out_root),
        "ufld_train_list": "list/train_gt.txt",
    }
    (out_root / "manifest.json").write_text(
        json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )

    readme = _render_readme(
        src_root,
        use_copy,
        len(train_lines),
        len(val_lines),
        len(test_gt_lines),
        len(test_inf_lines),
    )
    (out_root / "README.md").write_text(readme, encoding="utf-8")

    print(json.dumps(manifest, indent=2, ensure_ascii=False))
    if missing:
        print(f"WARNING: {len(missing)} missing files (see manifest)", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: build the dataset only when executed directly.
if __name__ == "__main__":
    main()
|
||||
96
datasets/lane.embedded.bak/scripts/build_ufld_pack.py
Normal file
96
datasets/lane.embedded.bak/scripts/build_ufld_pack.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Build one incremental UFLD pack: DATASET-AddBy-<engineer>-<date>
|
||||
|
||||
Wrapper around build_ufld_dataset layout logic; does not modify base DATASET/.
|
||||
|
||||
Example:
|
||||
python build_ufld_pack.py \\
|
||||
--src /path/to/archive \\
|
||||
--parent /home/chengfanglu/DATA/lane0_copy \\
|
||||
--engineer zhangsan \\
|
||||
--date 20260615
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
|
||||
|
||||
def sanitize_engineer(name: str) -> str:
    """Return the whitespace-stripped engineer name after validating it.

    Raises:
        ValueError: if the name is empty after stripping, or contains
            anything other than ASCII letters, digits, underscore or hyphen.
    """
    cleaned = name.strip()
    if cleaned == "":
        raise ValueError("engineer name is empty")
    if re.match(r"^[A-Za-z0-9_\-]+$", cleaned) is None:
        raise ValueError("engineer: use letters, digits, underscore, hyphen only")
    return cleaned
|
||||
|
||||
|
||||
def pack_name(engineer: str, date: str) -> str:
    """Return the pack directory name ``DATASET-AddBy-<engineer>-<YYYYMMDD>``.

    Non-digit characters in ``date`` are dropped first, so ``2026-06-15`` and
    ``20260615`` are equivalent.

    Raises:
        ValueError: if the remaining digits are not exactly 8 long, or do not
            form a real calendar day (e.g. month 13).
    """
    import datetime as _dt  # local import: this module does not import datetime

    digits = re.sub(r"[^0-9]", "", date)
    if len(digits) != 8:
        raise ValueError("date must be YYYYMMDD (8 digits)")
    try:
        # Constructing the date rejects impossible values such as 20261345.
        _dt.date(int(digits[:4]), int(digits[4:6]), int(digits[6:]))
    except ValueError as exc:
        raise ValueError(f"date is not a valid calendar day: {digits}") from exc
    return f"DATASET-AddBy-{engineer}-{digits}"
|
||||
|
||||
|
||||
def main() -> None:
    """Build one incremental pack by delegating to ``build_ufld_dataset.py``.

    The pack directory is ``<parent>/DATASET-AddBy-<engineer>-<date>``.  After
    the build, ``manifest.json`` (when present) is annotated with the pack
    identity.  Invalid ``--engineer`` / ``--date`` values are reported as
    argument errors.  If the build script exits non-zero (for example because
    source files were missing) the manifest is still annotated and this
    script exits with the same status.
    """
    import json  # local import: this module does not import json at top level

    ap = argparse.ArgumentParser(description="Build DATASET-AddBy-<engineer>-<date> pack")
    ap.add_argument("--src", type=Path, required=True, help="archive with train_val_gt.txt")
    ap.add_argument("--parent", type=Path, default=Path("/home/chengfanglu/DATA/lane0_copy"))
    ap.add_argument("--engineer", type=str, required=True)
    ap.add_argument("--date", type=str, required=True, help="YYYYMMDD")
    ap.add_argument("--copy", action="store_true")
    ap.add_argument("--val-ratio", type=float, default=0.1)
    ap.add_argument("--seed", type=int, default=42)
    args = ap.parse_args()

    try:
        engineer = sanitize_engineer(args.engineer)
        out_name = pack_name(engineer, args.date)
    except ValueError as exc:
        ap.error(str(exc))
    out_root = args.parent.resolve() / out_name

    if out_root.exists():
        # A regular file at the pack path cannot be iterated; refuse it too.
        if not out_root.is_dir() or any(out_root.iterdir()):
            sys.exit(f"Refusing to overwrite non-empty pack: {out_root}")

    build_script = SCRIPT_DIR / "build_ufld_dataset.py"
    cmd = [
        sys.executable,
        str(build_script),
        "--src",
        str(args.src.resolve()),
        "--out",
        str(out_root),
        "--val-ratio",
        str(args.val_ratio),
        "--seed",
        str(args.seed),
    ]
    if args.copy:
        cmd.append("--copy")

    print(f"Building pack: {out_name}", file=sys.stderr)
    returncode = subprocess.call(cmd)

    # Annotate whatever manifest the build produced, even on a non-zero exit,
    # so a partially successful pack is still identifiable.
    manifest_path = out_root / "manifest.json"
    if manifest_path.is_file():
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
        manifest["pack_name"] = out_name
        manifest["engineer"] = engineer
        manifest["pack_date"] = re.sub(r"[^0-9]", "", args.date)
        manifest["layout"] = "DATASET-AddBy-<engineer>-<date>"
        manifest_path.write_text(
            json.dumps(manifest, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
        )

    if returncode != 0:
        print(f"Build script exited with status {returncode}: {out_root}", file=sys.stderr)
        sys.exit(returncode)

    print(f"Done: {out_root}")
|
||||
|
||||
|
||||
# Script entry point: build the pack only when executed directly.
if __name__ == "__main__":
    main()
|
||||
210
datasets/lane.embedded.bak/scripts/merge_ufld_lists.py
Normal file
210
datasets/lane.embedded.bak/scripts/merge_ufld_lists.py
Normal file
@@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge UFLD list files across DATASET + DATASET-AddBy-<engineer>-<date> packs.
|
||||
|
||||
When --prefix-from-pack is set, data_root should be lane0_copy (parent of all packs).
|
||||
Each input list path must live under <pack>/list/*.txt; lines get prefixed as <pack>/images/...
|
||||
|
||||
Example:
|
||||
python merge_ufld_lists.py \\
|
||||
--data-root /home/chengfanglu/DATA/lane0_copy \\
|
||||
--prefix-from-pack \\
|
||||
--out lists_merged/train_all_v2.txt \\
|
||||
--update-registry \\
|
||||
DATASET/list/train_gt.txt \\
|
||||
DATASET-AddBy-zhangsan-20260615/list/train_gt.txt
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_gt_line(line: str) -> tuple[str, str] | None:
    """Return ``(image, mask)`` from a whitespace-separated list line.

    Leading slashes are removed from both paths; extra columns are ignored.
    ``None`` is returned when the line has fewer than two columns.
    """
    try:
        img, msk = line.split()[:2]
    except ValueError:
        # Fewer than two columns: nothing to unpack.
        return None
    return img.lstrip("/"), msk.lstrip("/")
|
||||
|
||||
|
||||
def resolve_list_path(path: Path, data_root: Path) -> Path:
    """Locate a list file, trying ``path`` as given and then under ``data_root``.

    Returns the resolved path of the first candidate that is an existing
    file.  Terminates the program via ``sys.exit`` when neither exists.
    """
    under_root = data_root / path
    for option in (path, under_root):
        if option.is_file():
            return option.resolve()
    sys.exit(f"list not found: {path} (also tried {under_root})")
|
||||
|
||||
|
||||
def pack_prefix_from_list(list_path: Path, data_root: Path) -> str:
    """Infer the ``<pack>/`` prefix from where a list file lives.

    For a list inside ``data_root`` the prefix is the first path component,
    provided the second component is ``list``.  For a list outside
    ``data_root`` the prefix is the name of the directory containing its
    ``list`` directory.  In every other case the prefix is empty.
    """
    list_path = list_path.resolve()
    data_root = data_root.resolve()
    try:
        rel = list_path.relative_to(data_root)
    except ValueError:
        # Not under data_root: fall back to the <pack>/list/<file> shape.
        holder = list_path.parent
        return f"{holder.parent.name}/" if holder.name == "list" else ""

    in_list_dir = len(rel.parts) >= 2 and rel.parts[1] == "list"
    return f"{rel.parts[0]}/" if in_list_dir else ""
|
||||
|
||||
|
||||
def apply_pack_prefix(img: str, msk: str, prefix: str) -> tuple[str, str]:
    """Prepend ``prefix`` to each path that does not already start with it.

    An empty prefix leaves both paths untouched.
    """
    if not prefix:
        return img, msk

    def _qualify(rel: str) -> str:
        return rel if rel.startswith(prefix) else prefix + rel

    return _qualify(img), _qualify(msk)
|
||||
|
||||
|
||||
def load_pairs(path: Path, prefix: str) -> list[tuple[str, str]]:
    """Read a list file and return its prefixed ``(image, mask)`` pairs.

    Lines that ``parse_gt_line`` rejects are skipped; undecodable bytes are
    replaced rather than raising.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    parsed = (parse_gt_line(row) for row in text.splitlines())
    return [apply_pack_prefix(pair[0], pair[1], prefix) for pair in parsed if pair]
|
||||
|
||||
|
||||
def validate_pairs(data_root: Path, pairs: list[tuple[str, str]]) -> tuple[list[str], list[str]]:
    """Return the image paths and mask paths that are not files under ``data_root``.

    Both lists keep the order in which the pairs were given.
    """

    def _absent(rel: str) -> bool:
        return not (data_root / rel).is_file()

    missing_img = [img for img, _ in pairs if _absent(img)]
    missing_msk = [msk for _, msk in pairs if _absent(msk)]
    return missing_img, missing_msk
|
||||
|
||||
|
||||
def update_registry(registry_path: Path, data_root: Path, out_rel: str, input_paths: list[Path]) -> None:
    """Record the merged list and any new packs in the registry JSON file.

    The registry is loaded if it exists, otherwise created with the
    ``ufld-multi-pack-v1`` skeleton.  Every input list contributes its pack
    name (once), and the merged list is stored under the basename of
    ``out_rel`` together with its creation time and resolved source lists.
    """
    if registry_path.is_file():
        registry = json.loads(registry_path.read_text(encoding="utf-8"))
    else:
        registry = {
            "schema": "ufld-multi-pack-v1",
            "parent_root": str(data_root),
            "base_pack": "DATASET",
            "packs": [],
            "merged_train_lists": {},
        }
    registry["parent_root"] = str(data_root)

    registered = {entry["name"] for entry in registry.get("packs", [])}
    for list_path in input_paths:
        prefix = pack_prefix_from_list(list_path, data_root)
        if prefix:
            pack = prefix.rstrip("/")
        else:
            pack = list_path.parent.parent.name
        if not pack or pack in registered:
            continue
        role = "baseline_v1" if pack == "DATASET" else "increment"
        registry.setdefault("packs", []).append({"name": pack, "path": pack, "role": role})
        registered.add(pack)

    merged_entry = {
        "path": out_rel.replace("\\", "/"),
        "created_utc": datetime.now(timezone.utc).isoformat(),
        "sources": [str(resolve_list_path(src, data_root)) for src in input_paths],
    }
    registry.setdefault("merged_train_lists", {})[Path(out_rel).name] = merged_entry

    serialized = json.dumps(registry, indent=2, ensure_ascii=False) + "\n"
    registry_path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def main() -> None:
    """Merge several UFLD list files into one de-duplicated list.

    Lists are processed in order (``--base`` first); a pair is skipped when
    its image path was already seen.  Unless ``--no-validate`` is given, every
    merged path must exist under ``--data-root`` or the run exits with status
    1 before anything is written.  Statistics are printed as JSON and
    optionally saved via ``--report``; ``--update-registry`` records the
    result in ``datasets_registry.json`` under the data root.
    """
    ap = argparse.ArgumentParser(description="Merge UFLD lists across DATASET / DATASET-AddBy-* packs")
    ap.add_argument(
        "--data-root",
        type=Path,
        required=True,
        help="parent dir containing DATASET and DATASET-AddBy-* (e.g. lane0_copy)",
    )
    ap.add_argument("--out", type=Path, required=True, help="output list, e.g. lists_merged/train_all_v2.txt")
    ap.add_argument("inputs", nargs="+", type=Path, help="pack list files, e.g. DATASET/list/train_gt.txt")
    ap.add_argument("--base", type=Path, default=None, help="processed first; duplicates skipped")
    ap.add_argument(
        "--prefix-from-pack",
        action="store_true",
        help="prefix each line with pack dir name inferred from input path",
    )
    ap.add_argument("--no-validate", action="store_true")
    ap.add_argument("--report", type=Path, default=None)
    ap.add_argument(
        "--update-registry",
        action="store_true",
        help="update datasets_registry.json under data-root",
    )
    args = ap.parse_args()

    data_root = args.data_root.resolve()
    ordered: list[tuple[str, Path]] = []
    if args.base:
        ordered.append(("base", resolve_list_path(args.base, data_root)))
    for i, p in enumerate(args.inputs):
        ordered.append((f"input{i}", resolve_list_path(p, data_root)))

    merged: list[tuple[str, str]] = []
    seen: set[str] = set()
    stats: dict = {"sources": {}}

    for _label, list_path in ordered:
        prefix = pack_prefix_from_list(list_path, data_root) if args.prefix_from_pack else ""
        added = skipped = 0
        for img, msk in load_pairs(list_path, prefix):
            # De-duplicate on the image path only; the first occurrence wins.
            if img in seen:
                skipped += 1
                continue
            seen.add(img)
            merged.append((img, msk))
            added += 1
        stats["sources"][str(list_path)] = {
            "pack_prefix": prefix,
            "added": added,
            "skipped_duplicate": skipped,
        }

    if not args.no_validate:
        missing_img, missing_msk = validate_pairs(data_root, merged)
        stats["missing_images"] = len(missing_img)
        stats["missing_masks"] = len(missing_msk)
        if missing_img or missing_msk:
            print(f"ERROR: missing {len(missing_img)} images, {len(missing_msk)} masks", file=sys.stderr)
            for p in missing_img[:10]:
                print(" img:", p, file=sys.stderr)
            for p in missing_msk[:10]:
                print(" msk:", p, file=sys.stderr)
            sys.exit(1)

    out_path = args.out if args.out.is_absolute() else data_root / args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # One pair per line; an empty merge yields an empty file, not a blank line.
    out_path.write_text("".join(f"{img} {msk}\n" for img, msk in merged), encoding="utf-8")

    stats["total_out"] = len(merged)
    stats["data_root"] = str(data_root)
    stats["output"] = str(out_path)
    stats["prefix_from_pack"] = args.prefix_from_pack
    stats["created_utc"] = datetime.now(timezone.utc).isoformat()

    print(json.dumps(stats, indent=2, ensure_ascii=False))
    print(f"Wrote {len(merged)} pairs -> {out_path}")

    if args.report:
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(json.dumps(stats, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    if args.update_registry:
        resolved_out = out_path.resolve()
        try:
            out_rel = resolved_out.relative_to(data_root).as_posix()
        except ValueError:
            # --out lies outside data_root: record the absolute path instead
            # of crashing after the merged list has already been written.
            out_rel = resolved_out.as_posix()
            print(f"NOTE: output is outside data-root; registry stores absolute path {out_rel}", file=sys.stderr)
        update_registry(
            data_root / "datasets_registry.json",
            data_root,
            out_rel,
            [p for _, p in ordered],
        )
        print(f"Updated {data_root / 'datasets_registry.json'}")
|
||||
|
||||
|
||||
# Script entry point: merge lists only when executed directly.
if __name__ == "__main__":
    main()
|
||||
325
datasets/lane.embedded.bak/scripts/rename_ufld_dataset.py
Normal file
325
datasets/lane.embedded.bak/scripts/rename_ufld_dataset.py
Normal file
@@ -0,0 +1,325 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rename lane0_copy/UFLD assets to a clearer layout and refresh index files.
|
||||
|
||||
Conventions
|
||||
-----------
|
||||
- Top-level sources: src_<type>_<device>_<YYYYMMDD> (seg_label/ mirrors the tree)
|
||||
- Clips: clip_XX, scene_XX, unit_XX, driver_XXX_30fps, video_<id>
|
||||
- Frames: frame_XXXXXX.jpg / .png (strip legacy _new suffix)
|
||||
- Camera frames: frame_cam_<id>, frame_ts_<timestamp>
|
||||
|
||||
Usage:
|
||||
python3 rename_ufld_dataset.py --dry-run
|
||||
python3 rename_ufld_dataset.py --apply
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
UFLD_ROOT = SCRIPT_DIR.parent / "UFLD"
|
||||
|
||||
# Legacy top-level source directory name -> new `src_...` directory name.
# transform_dir_component consults this table before any pattern rule.
# NOTE(review): the years embedded in the dated targets are not derivable
# from the legacy names alone; confirm them against the capture records.
TOP_LEVEL_MAP: dict[str, str] = {
    "100HF": "src_freeway_100hf_day",
    "60HF_night": "src_freeway_60hf_night",
    "crv_lane": "src_vehicle_crv_lane",
    "culane_data": "src_culane",
    "dvr_0422_zxc": "src_dvr_zxc_20250422",
    "dvr_0424_zxc": "src_dvr_zxc_20250424",
    "dvr_0425_buick": "src_dvr_buick_20250425",
    "dvr_0503_buick": "src_dvr_buick_20250503",
    "jiqing_highway": "src_road_jiqing",
    "pic_0507_zk282": "src_cam_zk282_20250507",
    "pic_0511_zk282": "src_cam_zk282_20250511",
    "pic_0514_zk282": "src_cam_zk282_20250514",
    "pic_0613_zk282": "src_cam_zk282_20250613",
    "pic_0620_zxc": "src_cam_zxc_20250620",
    "pic_0624_zxc": "src_cam_zxc_20250624",
    "pic_0628_zxc": "src_cam_zxc_20250628",
    "pic_1009_zk282_front30dig": "src_cam_zk282_20241009_front30deg",
    "pic_1209_zk282": "src_cam_zk282_20241209",
    "pic_250211_zk282": "src_cam_zk282_20250211",
    "pic_250515_zk425": "src_cam_zk425_20250515",
    "pic_250609_zk425": "src_cam_zk425_20250609",
    "shaoyang_data": "src_road_shaoyang",
    "vil": "src_vil",
}
|
||||
|
||||
# Index/list files at the dataset root whose path entries are rewritten
# after files have been renamed.
INDEX_FILES = [
    "train_val_gt.txt",
    "test_gt.txt",
    "test.txt",
    "test.json",
    "train_val.json",
    "test_label.json",
]

# The index files themselves are never renamed.
SKIP_BASENAMES = set(INDEX_FILES)
|
||||
|
||||
|
||||
def transform_dir_component(name: str) -> str:
    """Return the new name for one directory component of a legacy path.

    ``TOP_LEVEL_MAP`` is consulted first.  Otherwise an ordered list of rules
    is tried and the first one that applies decides the result; a name that
    no rule recognises is returned unchanged.
    """
    known = TOP_LEVEL_MAP.get(name)
    if known is not None:
        return known

    def _clip(m):
        # Clip numbers below 1000 are zero-padded to two digits.
        number = int(m.group(1))
        return f"clip_{number:02d}" if number < 1000 else f"clip_{number}"

    # (probe, render) pairs in priority order.  Each probe returns a truthy
    # value (usually a match object) when its rule applies.
    rules = (
        (
            lambda: re.match(r"^scene(\d+)$", name, re.I),
            lambda m: f"scene_{int(m.group(1)):02d}",
        ),
        (
            lambda: re.match(r"^dvr_(\d+)$", name, re.I),
            lambda m: f"unit_{int(m.group(1)):02d}",
        ),
        (lambda: re.match(r"^(\d+)$", name), _clip),
        (
            lambda: re.match(r"^driver_(\d+)_30frame$", name, re.I),
            lambda m: f"driver_{int(m.group(1)):03d}_30fps",
        ),
        (
            lambda: name.upper().endswith(".MP4"),
            lambda _: "video_" + name[: -len(".MP4")],
        ),
        (
            lambda: re.match(r"^(\d+)_Road(\d+)_Trim(\d+)_frames$", name, re.I),
            lambda m: f"road_{m.group(2)}_trim_{int(m.group(3)):03d}_seq_{int(m.group(1)):02d}",
        ),
        (lambda: name == "image_curve", lambda _: "curve"),
        (lambda: re.match(r"^highway_\d+$", name), lambda _: "highway"),
        (
            lambda: re.match(r"^img_(\d+)_(\d+)_batch(\d+)$", name, re.I),
            lambda m: f"batch_{int(m.group(3)):02d}_stream{int(m.group(2))}",
        ),
        (
            lambda: re.match(r"^pic_(\d+)_([a-z]+)_batch(\d+)$", name, re.I),
            lambda m: f"batch_{int(m.group(3)):02d}_{m.group(2)}",
        ),
        (
            lambda: ("batch" in name.lower() or None) and re.search(r"batch(\d+)", name, re.I),
            lambda m: f"batch_{int(m.group(1)):02d}",
        ),
    )
    for probe, render in rules:
        hit = probe()
        if hit:
            return render(hit)
    return name
|
||||
|
||||
|
||||
def transform_filename(name: str) -> str:
    """Return the new file name for a legacy frame, mask or label file.

    Index files listed in ``SKIP_BASENAMES`` are returned unchanged.  For
    everything else the stem loses a trailing ``_new`` and is then renamed:

    - all digits              -> ``frame_<6-digit number>``
    - ``camera_msg_<id>``     -> ``frame_cam_<id>``
    - ``camera_front_6mm_<id>`` -> ``frame_cam_<id>``
    - ``camera_<ts>``         -> ``frame_ts_<ts>``
    - ``frame_<a>_<b>``       -> kept as is
    - ``frame_<n>``           -> ``frame_<6-digit number>``

    The compound suffix ``.lines.txt`` is treated as one extension, so a
    label file is renamed with the same stem as the frame beside it.
    ``os.path.splitext`` alone would only split off ``.txt``, which
    previously left ``*.lines.txt`` files unrenamed.
    """
    if name in SKIP_BASENAMES:
        return name

    lines_suffix = ".lines.txt"
    if name.endswith(lines_suffix):
        base, ext = name[: -len(lines_suffix)], lines_suffix
    else:
        base, ext = os.path.splitext(name)

    if base.endswith("_new"):
        base = base[: -len("_new")]

    m = re.match(r"^(\d+)$", base)
    if m:
        return f"frame_{int(m.group(1)):06d}{ext}"
    m = re.match(r"^camera_msg_(\d+)$", base, re.I)
    if m:
        return f"frame_cam_{m.group(1)}{ext}"
    m = re.match(r"^camera_front_6mm_(\d+)$", base, re.I)
    if m:
        return f"frame_cam_{m.group(1)}{ext}"
    m = re.match(r"^camera_+(\d+)$", base, re.I)
    if m:
        return f"frame_ts_{m.group(1)}{ext}"
    # Two-part frame ids are deliberately left untouched.
    m = re.match(r"^frame_(\d+)_(\d+)$", base)
    if m:
        return f"frame_{m.group(1)}_{m.group(2)}{ext}"
    m = re.match(r"^frame_(\d+)$", base, re.I)
    if m:
        return f"frame_{int(m.group(1)):06d}{ext}"
    return f"{base}{ext}"
|
||||
|
||||
|
||||
def transform_rel_path(rel: str) -> str:
    """Rename every component of a root-relative path.

    A leading ``seg_label`` component is kept verbatim.  The component after
    it (or the first one) and all intermediate components are renamed as
    directories; the final component of a longer path is renamed as a file.
    An empty path is returned unchanged.
    """
    rel = rel.lstrip("/").replace("\\", "/")
    if not rel:
        return rel

    pieces = rel.split("/")
    renamed: list[str] = []
    if pieces[0] == "seg_label":
        renamed.append(pieces.pop(0))

    if pieces:
        head, *rest = pieces
        renamed.append(transform_dir_component(head))
        if rest:
            renamed.extend(transform_dir_component(part) for part in rest[:-1])
            renamed.append(transform_filename(rest[-1]))
    return "/".join(renamed)
|
||||
|
||||
|
||||
def collect_file_mappings(root: Path) -> dict[str, str]:
    """Walk ``root`` and return ``old_rel -> new_rel`` for files whose path changes.

    Paths are relative to ``root`` with forward slashes.  Files named in
    ``SKIP_BASENAMES`` are ignored, as are files whose transformed path
    equals the current one.
    """
    renames: dict[str, str] = {}
    for current, _subdirs, filenames in os.walk(root):
        rel_dir = os.path.relpath(current, root)
        dir_prefix = "" if rel_dir == "." else f"{rel_dir}/"
        for filename in filenames:
            if filename in SKIP_BASENAMES:
                continue
            source = (dir_prefix + filename).replace("\\", "/")
            target = transform_rel_path(source)
            if target != source:
                renames[source] = target
    return renames
|
||||
|
||||
|
||||
def apply_renames(root: Path, mapping: dict[str, str], dry_run: bool) -> tuple[int, int]:
    """Rename files under ``root`` according to ``mapping``.

    Entries whose source is not an existing file are skipped silently.  When
    the target already exists as a different file, a ``COLLISION`` line is
    printed and the entry counts as an error.  With ``dry_run`` nothing is
    moved but the counts are still produced.

    Returns:
        ``(ok, err)``: the number of renames done (or that would be done)
        and the number of collisions.
    """
    renamed = failed = 0
    # Deepest paths first, ties broken alphabetically.
    deepest_first = sorted(mapping, key=lambda rel: (-rel.count("/"), rel))
    for source_rel in deepest_first:
        target_rel = mapping[source_rel]
        source = root / source_rel
        target = root / target_rel
        if not source.is_file():
            continue
        if target.exists() and target.resolve() != source.resolve():
            print(f"COLLISION: {source_rel} -> {target_rel} (target exists)")
            failed += 1
        else:
            if not dry_run:
                target.parent.mkdir(parents=True, exist_ok=True)
                os.rename(source, target)
            renamed += 1
    return renamed, failed
|
||||
|
||||
|
||||
def prune_empty_dirs(root: Path, dry_run: bool) -> int:
    """Remove every directory under ``root`` that is, or becomes, empty.

    Directories are visited bottom-up.  A directory counts as empty when it
    holds no files and all of its sub-directories were already pruned during
    this call, so whole chains of nested empty directories are removed in a
    single pass.  ``root`` itself is never removed.  With ``dry_run`` nothing
    is deleted, but the returned count matches what a real run would remove.

    Returns:
        The number of directories removed (or that would be removed).
    """
    removed = 0
    pruned: set[str] = set()
    for dirpath, dirs, files in os.walk(root, topdown=False):
        if files:
            continue
        # `dirs` was listed before the children were visited, so it may name
        # directories this call has already pruned; ignore those.
        if any(os.path.join(dirpath, sub) not in pruned for sub in dirs):
            continue
        current = Path(dirpath)
        if current == root:
            continue
        if not dry_run:
            try:
                current.rmdir()
            except OSError:
                # Best effort: leave directories that cannot be removed.
                continue
        pruned.add(dirpath)
        removed += 1
    return removed
|
||||
|
||||
|
||||
# A run of characters that can form one path: anything but whitespace/quotes.
_PATH_TOKEN = re.compile(r"""[^\s"']+""")


def replace_in_line(line: str, mapping: dict[str, str]) -> str:
    """Rewrite every renamed path that occurs in ``line``.

    The text is scanned once for path-like tokens (runs without whitespace
    or quotes).  For each token the longest suffix that starts at the token
    start or right after a ``/`` and is a key of ``mapping`` is replaced by
    its new value; whatever precedes that suffix (a leading slash or a
    directory prefix) is kept.  Tokens with no such suffix are left alone.

    Matching whole suffixes instead of raw substrings means a longer name
    such as ``x.jpg.bak`` is never partially rewritten, paths in any column
    are handled with or without a leading slash, and the cost per call
    depends on the length of ``line`` rather than the size of ``mapping``.

    Args:
        line: One text line, or a whole document such as a JSON file.
        mapping: ``old path -> new path``; keys may or may not start with "/".

    Returns:
        The text with known paths replaced; other characters are unchanged.
    """
    if not mapping:
        return line

    def _swap(match):
        token = match.group(0)
        start = 0
        while True:
            replacement = mapping.get(token[start:])
            if replacement is not None:
                return token[:start] + replacement
            slash = token.find("/", start)
            if slash < 0:
                return token
            # Next candidate suffix begins right after this slash.
            start = slash + 1

    return _PATH_TOKEN.sub(_swap, line)
|
||||
|
||||
|
||||
def update_index_files(root: Path, mapping: dict[str, str], dry_run: bool) -> None:
    """Rewrite renamed paths inside the index files found directly under ``root``.

    Each existing file from ``INDEX_FILES`` is read; with ``dry_run`` nothing
    further happens.  Otherwise a ``<name>.bak`` copy is made once (an
    existing backup is never overwritten) and the file is rewritten: JSON
    files are passed to ``replace_in_line`` as one string, text lists line by
    line with line endings preserved.
    """
    # Offer every path both with and without a leading slash.
    lookup = {"/" + old: "/" + new for old, new in mapping.items()}
    lookup.update(mapping)

    for index_name in INDEX_FILES:
        index_path = root / index_name
        if not index_path.is_file():
            continue
        content = index_path.read_text(encoding="utf-8", errors="replace")
        if dry_run:
            continue

        if index_name.endswith(".json"):
            rewritten = replace_in_line(content, lookup)
        else:
            rows = content.splitlines(keepends=True)
            rewritten = "".join(replace_in_line(row, lookup) for row in rows)

        backup = index_path.with_suffix(index_path.suffix + ".bak")
        if not backup.exists():
            shutil.copy2(index_path, backup)
        index_path.write_text(rewritten, encoding="utf-8")
|
||||
|
||||
|
||||
def check_collisions(mapping: dict[str, str]) -> list[str]:
    """Describe every target path that more than one source maps to.

    Each description has the form ``"<new> <= [<old>, ...]"``; targets appear
    in the order they are first seen in ``mapping``.
    """
    sources_by_target: dict[str, list[str]] = {}
    for old, new in mapping.items():
        sources_by_target.setdefault(new, []).append(old)

    report = []
    for new, olds in sources_by_target.items():
        if len(olds) > 1:
            report.append(f"{new} <= {olds}")
    return report
|
||||
|
||||
|
||||
def main() -> None:
    """Rename dataset files under ``--root`` and refresh the index files.

    The default mode, also selected by ``--dry-run``, only reports what would
    change.  ``--apply`` renames the files, removes directories left empty,
    rewrites the index files (keeping ``*.bak`` backups) and writes
    ``rename_manifest.json``.  The two flags cannot be combined, so a dry run
    can never prune directories or rewrite index files.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", type=Path, default=UFLD_ROOT)
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--apply", action="store_true")
    args = ap.parse_args()
    if args.dry_run and args.apply:
        ap.error("--dry-run and --apply are mutually exclusive")
    dry_run = not args.apply

    root = args.root.resolve()
    if not root.is_dir():
        raise SystemExit(f"Root not found: {root}")
    print(f"Root: {root}")
    mapping = collect_file_mappings(root)
    print(f"File path mappings: {len(mapping)}")

    collisions = check_collisions(mapping)
    if collisions:
        print(f"WARNING: {len(collisions)} target collisions (showing 20)")
        for c in collisions[:20]:
            print(" ", c)
        if not dry_run:
            raise SystemExit("Abort: fix collisions before apply")

    ok, err = apply_renames(root, mapping, dry_run=dry_run)
    print(f"Renames: ok={ok} err={err} dry_run={dry_run}")

    if dry_run:
        for a, b in list(mapping.items())[:8]:
            print(f" {a}\n -> {b}")
        return

    empty = prune_empty_dirs(root, dry_run=False)
    print(f"Removed {empty} empty directories")

    # When some renames failed, rewrite index entries only for files that
    # really moved, so the index never points at a path that does not exist.
    applied = mapping
    if err:
        applied = {
            old: new
            for old, new in mapping.items()
            if (root / new).is_file() and not (root / old).exists()
        }
    update_index_files(root, applied, dry_run=False)

    meta = {
        "root": str(root),
        "files_renamed": ok,
        "mapping_count": len(mapping),
        "top_level_map": TOP_LEVEL_MAP,
    }
    (root / "rename_manifest.json").write_text(
        json.dumps({"meta": meta, "sample": dict(list(mapping.items())[:50])}, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    print("Updated index files (backups: *.bak)")
|
||||
|
||||
|
||||
# Script entry point: run the rename tool only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user