Files
HSAP/datasets/lane.embedded.bak/scripts/merge_ufld_lists.py
Chengfang Lu e72bc061c5 feat: HSAP platform v2 — modular navigation, quality review, audit log, world model simulation
Major changes:
- New frontend (platform/web/): Vite + React 18 + TypeScript + Tailwind
- 4-module navigation: 数据送标 / 模型管理 / 车队管理 / 系统管理
- Data catalog with charts (DMS/ADAS/Lane 3-tab view)
- Quality review workflow (标注质检): Good/Fine/Bad scoring with auto-advance
- Audit enhancements: batch operations, rejection categories, Feishu notifications
- Operation audit log (操作日志)
- World model simulation studio (仿真工坊)
- Dataset version management with snapshots and diff
- ADAS 7-class dataset integration (138K images organized + compressed)
- User management with Feishu integration and pagination
- CRUD/search/filter on all pages, card layout redesign
- PIL-optimized image overlay rendering
- Auto-snapshot on build, in_review workflow stage
- Removed embedded algorithm code (now in workspace)
2026-06-03 11:40:21 +08:00

211 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Merge UFLD list files across DATASET + DATASET-AddBy-<engineer>-<date> packs.
When --prefix-from-pack is set, data_root should be lane0_copy (parent of all packs).
Each input list path must live under <pack>/list/*.txt; lines get prefixed as <pack>/images/...
Example:
python merge_ufld_lists.py \\
--data-root /home/chengfanglu/DATA/lane0_copy \\
--prefix-from-pack \\
--out lists_merged/train_all_v2.txt \\
--update-registry \\
DATASET/list/train_gt.txt \\
DATASET-AddBy-zhangsan-20260615/list/train_gt.txt
"""
from __future__ import annotations
import argparse
import json
import sys
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
def parse_gt_line(line: str) -> tuple[str, str] | None:
    """Extract the (image, mask) path pair from one list line.

    The line is split on whitespace. The first two fields are taken as the
    image path and the mask path, each with any leading "/" removed; any
    further fields on the line are ignored.

    Returns:
        ``(image, mask)``, or ``None`` when the line has fewer than two fields
        (e.g. blank lines).
    """
    fields = line.strip().split()
    if len(fields) >= 2:
        image, mask = (field.lstrip("/") for field in fields[:2])
        return image, mask
    return None
def resolve_list_path(path: Path, data_root: Path) -> Path:
    """Locate a list file and return its resolved path.

    The path is tried as given first, then joined onto ``data_root``. The
    first one that is an existing file wins.

    Raises:
        SystemExit: with a "list not found" message when neither location
            is an existing file.
    """
    under_root = data_root / path
    for option in (path, under_root):
        if option.is_file():
            return option.resolve()
    sys.exit(f"list not found: {path} (also tried {under_root})")
def pack_prefix_from_list(list_path: Path, data_root: Path) -> str:
    """Infer the "<pack>/" prefix that belongs to a list file.

    Both paths are resolved before comparison.

    * List under ``data_root``: the prefix is the first path component below
      the root, provided the second component is ``list``
      (``DATASET/list/train_gt.txt`` -> ``"DATASET/"``).
    * List outside ``data_root``: the grandparent directory name is used when
      the file's own directory is named ``list``.

    In every other case an empty string is returned.
    """
    resolved_list = list_path.resolve()
    resolved_root = data_root.resolve()
    try:
        inside = resolved_list.relative_to(resolved_root)
    except ValueError:
        # Not below data_root: fall back to the .../<pack>/list/<file> layout.
        list_dir = resolved_list.parent
        return f"{list_dir.parent.name}/" if list_dir.name == "list" else ""
    components = inside.parts
    if len(components) < 2 or components[1] != "list":
        return ""
    return f"{components[0]}/"
def apply_pack_prefix(img: str, msk: str, prefix: str) -> tuple[str, str]:
    """Prepend ``prefix`` to the image and mask paths where it is missing.

    A path that already starts with ``prefix`` is left unchanged, so applying
    the same prefix twice does not double it. An empty prefix returns both
    paths untouched.
    """
    if prefix:
        img, msk = (
            rel if rel.startswith(prefix) else prefix + rel
            for rel in (img, msk)
        )
    return img, msk
def load_pairs(path: Path, prefix: str) -> list[tuple[str, str]]:
    """Read a list file and return its (image, mask) pairs with ``prefix`` applied.

    The file is decoded as UTF-8 with undecodable bytes replaced. Lines that
    ``parse_gt_line`` rejects (fewer than two fields) are skipped; the order
    of the remaining lines is preserved.
    """
    text = path.read_text(encoding="utf-8", errors="replace")
    parsed = (parse_gt_line(raw) for raw in text.splitlines())
    # filter(None, ...) drops the None results produced for unparsable lines.
    return [apply_pack_prefix(img, msk, prefix) for img, msk in filter(None, parsed)]
def validate_pairs(data_root: Path, pairs: list[tuple[str, str]]) -> tuple[list[str], list[str]]:
    """Check that every image and mask path exists as a file under ``data_root``.

    Returns:
        ``(missing_images, missing_masks)``: the relative paths, in input
        order, for which ``data_root / path`` is not a regular file.
    """
    missing_img: list[str] = []
    missing_msk: list[str] = []
    for img, msk in pairs:
        for rel, bucket in ((img, missing_img), (msk, missing_msk)):
            if not (data_root / rel).is_file():
                bucket.append(rel)
    return missing_img, missing_msk
def update_registry(registry_path: Path, data_root: Path, out_rel: str, input_paths: list[Path]) -> None:
    """Record the packs and the merged list in the JSON registry file.

    Loads ``registry_path`` if it exists, otherwise starts from a fresh
    "ufld-multi-pack-v1" skeleton. ``parent_root`` is always overwritten with
    ``data_root``. Every pack inferred from ``input_paths`` that is not yet
    listed is appended (role "baseline_v1" for the pack named "DATASET",
    "increment" for any other). The merged list is stored under
    ``merged_train_lists`` keyed by the file name of ``out_rel``, replacing
    any earlier entry with the same name. The registry is then written back
    as indented UTF-8 JSON.
    """
    root_str = str(data_root)
    if not registry_path.is_file():
        registry = {
            "schema": "ufld-multi-pack-v1",
            "parent_root": root_str,
            "base_pack": "DATASET",
            "packs": [],
            "merged_train_lists": {},
        }
    else:
        registry = json.loads(registry_path.read_text(encoding="utf-8"))
    registry["parent_root"] = root_str

    registered = {entry["name"] for entry in registry.get("packs", [])}
    for list_file in input_paths:
        pack_prefix = pack_prefix_from_list(list_file, data_root)
        # No inferable prefix: fall back to the directory two levels above the list.
        pack_name = pack_prefix.rstrip("/") if pack_prefix else list_file.parent.parent.name
        if not pack_name or pack_name in registered:
            continue
        role = "baseline_v1" if pack_name == "DATASET" else "increment"
        registry.setdefault("packs", []).append(
            {"name": pack_name, "path": pack_name, "role": role}
        )
        registered.add(pack_name)

    entry = {
        "path": out_rel.replace("\\", "/"),
        "created_utc": datetime.now(timezone.utc).isoformat(),
        "sources": [str(resolve_list_path(src, data_root)) for src in input_paths],
    }
    merged_lists = registry.setdefault("merged_train_lists", {})
    merged_lists[Path(out_rel).name] = entry

    serialized = json.dumps(registry, indent=2, ensure_ascii=False) + "\n"
    registry_path.write_text(serialized, encoding="utf-8")
def _registry_out_rel(out_path: Path, data_root: Path) -> str:
    """Return ``out_path`` as a "/"-separated string for the registry.

    A path relative to ``data_root`` is preferred. It is tried on the path as
    given and then on its resolved form, so an output reached through a
    different spelling of the root (e.g. a symlink) still matches. When the
    output lies outside ``data_root`` the resolved absolute path is returned
    instead of letting ``Path.relative_to`` raise ``ValueError``.
    """
    for candidate in (out_path, out_path.resolve()):
        try:
            return str(candidate.relative_to(data_root)).replace("\\", "/")
        except ValueError:
            continue
    return str(out_path.resolve()).replace("\\", "/")


def main() -> None:
    """Command-line entry point: merge list files into a single output list.

    Steps:
      1. Parse arguments and resolve every list path (``--base`` first, then
         the positional inputs in order).
      2. Load each list, optionally prefixing paths with the pack directory
         name, and de-duplicate on the image path (first occurrence wins).
      3. Unless ``--no-validate`` is given, check that every image and mask
         exists under ``--data-root``; exit with status 1 if any is missing.
      4. Write the merged list (relative ``--out`` is placed under
         ``--data-root``), print the statistics as JSON, and optionally write
         them to ``--report``.
      5. With ``--update-registry``, update ``datasets_registry.json`` under
         ``--data-root``.
    """
    ap = argparse.ArgumentParser(description="Merge UFLD lists across DATASET / DATASET-AddBy-* packs")
    ap.add_argument(
        "--data-root",
        type=Path,
        required=True,
        help="parent dir containing DATASET and DATASET-AddBy-* (e.g. lane0_copy)",
    )
    ap.add_argument("--out", type=Path, required=True, help="output list, e.g. lists_merged/train_all_v2.txt")
    ap.add_argument("inputs", nargs="+", type=Path, help="pack list files, e.g. DATASET/list/train_gt.txt")
    ap.add_argument("--base", type=Path, default=None, help="processed first; duplicates skipped")
    ap.add_argument(
        "--prefix-from-pack",
        action="store_true",
        help="prefix each line with pack dir name inferred from input path",
    )
    ap.add_argument("--no-validate", action="store_true")
    ap.add_argument("--report", type=Path, default=None)
    ap.add_argument(
        "--update-registry",
        action="store_true",
        help="update datasets_registry.json under data-root",
    )
    args = ap.parse_args()
    data_root = args.data_root.resolve()

    # --base goes first so that its entries win over duplicates in later inputs.
    ordered: list[tuple[str, Path]] = []
    if args.base:
        ordered.append(("base", resolve_list_path(args.base, data_root)))
    for i, p in enumerate(args.inputs):
        ordered.append((f"input{i}", resolve_list_path(p, data_root)))

    merged: list[tuple[str, str]] = []
    seen: set[str] = set()  # image paths already emitted (de-duplication key)
    stats: dict = {"sources": {}}
    for name, list_path in ordered:
        prefix = pack_prefix_from_list(list_path, data_root) if args.prefix_from_pack else ""
        added = skipped = 0
        for img, msk in load_pairs(list_path, prefix):
            if img in seen:
                skipped += 1
                continue
            seen.add(img)
            merged.append((img, msk))
            added += 1
        stats["sources"][str(list_path)] = {
            "pack_prefix": prefix,
            "added": added,
            "skipped_duplicate": skipped,
        }

    if not args.no_validate:
        missing_img, missing_msk = validate_pairs(data_root, merged)
        stats["missing_images"] = len(missing_img)
        stats["missing_masks"] = len(missing_msk)
        if missing_img or missing_msk:
            print(f"ERROR: missing {len(missing_img)} images, {len(missing_msk)} masks", file=sys.stderr)
            # Only the first ten of each kind are listed to keep the output short.
            for p in missing_img[:10]:
                print(" img:", p, file=sys.stderr)
            for p in missing_msk[:10]:
                print(" msk:", p, file=sys.stderr)
            sys.exit(1)

    out_path = args.out if args.out.is_absolute() else data_root / args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # One newline-terminated line per pair; an empty merge yields an empty file
    # rather than a file holding a single blank line.
    out_path.write_text("".join(f"{img} {msk}\n" for img, msk in merged), encoding="utf-8")

    stats["total_out"] = len(merged)
    stats["data_root"] = str(data_root)
    stats["output"] = str(out_path)
    stats["prefix_from_pack"] = args.prefix_from_pack
    stats["created_utc"] = datetime.now(timezone.utc).isoformat()
    print(json.dumps(stats, indent=2, ensure_ascii=False))
    print(f"Wrote {len(merged)} pairs -> {out_path}")

    if args.report:
        args.report.parent.mkdir(parents=True, exist_ok=True)
        args.report.write_text(json.dumps(stats, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")

    if args.update_registry:
        # An --out outside data-root must not crash here: the merged list has
        # already been written, so fall back to recording its absolute path.
        out_rel = _registry_out_rel(out_path, data_root)
        update_registry(
            data_root / "datasets_registry.json",
            data_root,
            out_rel,
            [p for _, p in ordered],
        )
        print(f"Updated {data_root / 'datasets_registry.json'}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()