Files
yolov26_3d/tools/pdcl_inference/stat_aeb_clips_by_scene.py

322 lines
10 KiB
Python
Raw Normal View History

2026-06-24 09:35:46 +08:00
"""
统计 aeb_clips*.json 中各场景的 rawid 数量和 clip 数量。
默认按每条 rawid 记录里的 "场景" 字段聚合,输出同名 *_stats.md。
示例:
python tools/pdcl_inference/stat_aeb_clips_by_scene.py \
--input tools/pdcl_inference/aeb_clips-20260322152509_to_20260430005758.json
python tools/pdcl_inference/stat_aeb_clips_by_scene.py \
--input tools/pdcl_inference/aeb_clips-20260322152509_to_20260430005758.json \
--json-output /tmp/aeb_scene_stats.json
"""
import argparse
import json
from collections import OrderedDict
from pathlib import Path
from typing import Any, Iterable
# Absolute, resolved path of this script; anchors files that live next to it.
FILE = Path(__file__).resolve()
# Input used when --input is omitted: a JSON file in the script's own directory.
DEFAULT_INPUT = FILE.with_name("aeb_clips-20260322152509_to_20260430005758.json")
def _to_str(value: Any) -> str:
    """Convert ``value`` to text with surrounding whitespace removed.

    ``None`` becomes the empty string; every other value goes through
    ``str()`` before stripping.
    """
    return "" if value is None else str(value).strip()
def _default_output_path(input_path: Path) -> Path:
    """Return ``<input stem>_stats.md`` located in the input file's directory."""
    report_name = input_path.stem + "_stats.md"
    return input_path.with_name(report_name)
def normalize_scene_name(scene_name: str) -> str:
    """Map legacy scene spellings with inconsistent letter case to one name.

    Names that have no registered alias are returned unchanged.
    """
    canonical_by_alias = {
        "CBLA-cn21": "CBLA-CN21",
        "CBLA-cn24": "CBLA-CN24",
    }
    try:
        return canonical_by_alias[scene_name]
    except KeyError:
        return scene_name
def load_scenario_records(input_path: Path) -> OrderedDict[str, list[dict[str, Any]]]:
    """Read the clips JSON file and return its per-scenario record lists.

    The top level must be a JSON object. Its ``"scenarios"`` member is used
    when present; otherwise the top-level object itself is treated as the
    scenario mapping.

    Args:
        input_path: JSON file to read, decoded as UTF-8.

    Returns:
        Ordered mapping from the stringified scenario key to that scenario's
        list of records, in the order the keys appear in the file.

    Raises:
        ValueError: If the top level or the scenario mapping is not a dict,
            or if any scenario value is not a list.
    """
    payload = json.loads(input_path.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise ValueError(f"输入 JSON 顶层必须是 dict实际: {type(payload).__name__}")

    scenarios = payload["scenarios"] if "scenarios" in payload else payload
    if not isinstance(scenarios, dict):
        raise ValueError("输入 JSON 的 scenarios 字段必须是 dict")

    records_by_key: OrderedDict[str, list[dict[str, Any]]] = OrderedDict()
    for scenario_key, records in scenarios.items():
        if isinstance(records, list):
            records_by_key[str(scenario_key)] = records
            continue
        raise ValueError(
            f"场景 {scenario_key} 对应的数据必须是 list实际: {type(records).__name__}"
        )
    return records_by_key
def iter_records(records_by_key: dict[str, list[dict[str, Any]]]) -> Iterable[tuple[str, dict[str, Any]]]:
    """Lazily yield ``(scenario_key, record)`` for every record of every scenario.

    Validation is lazy as well: a ``ValueError`` is raised only when iteration
    reaches a record that is not a dict, after earlier pairs were yielded.
    """
    for scenario_key, records in records_by_key.items():
        for record in records:
            if isinstance(record, dict):
                yield scenario_key, record
                continue
            raise ValueError(
                f"场景 {scenario_key} 中的记录必须是 dict实际: {type(record).__name__}"
            )
def get_record_scene_name(scenario_key: str, record: dict[str, Any], normalize_scene: bool) -> str:
    """Determine the scene name under which a record is aggregated.

    The record's own "场景" field wins. When it is missing or blank, the text
    of ``scenario_key`` before its first "-" is used, and an empty key falls
    back to a fixed "unknown scene" label. The result goes through
    ``normalize_scene_name`` only when ``normalize_scene`` is true.
    """
    scene_name = _to_str(record.get("场景"))
    if not scene_name:
        if scenario_key:
            scene_name = scenario_key.partition("-")[0]
        else:
            scene_name = "未知场景"
    if normalize_scene:
        return normalize_scene_name(scene_name)
    return scene_name
def get_record_rawid(record: dict[str, Any]) -> str:
    """Return the record's raw id as stripped text, or "" when none is present.

    "rawid" is preferred; "raw_id" is consulted only when "rawid" is absent
    or falsy.
    """
    raw_id = record.get("rawid")
    if not raw_id:
        raw_id = record.get("raw_id")
    return _to_str(raw_id)
def get_record_clips(record: dict[str, Any], raw_id: str) -> list[str]:
    """Return the record's clip names as stripped, non-empty strings.

    A missing or ``None`` "clips" field yields an empty list.

    Args:
        record: One rawid record.
        raw_id: The record's raw id; used only in the error message.

    Raises:
        ValueError: If "clips" is present but is neither ``None`` nor a list.
    """
    raw_clips = record.get("clips", [])
    if raw_clips is None:
        return []
    if not isinstance(raw_clips, list):
        raise ValueError(f"rawid={raw_id or 'UNKNOWN'} 的 clips 字段必须是 list")

    clips: list[str] = []
    for entry in raw_clips:
        clip = _to_str(entry)
        if clip:
            # Entries that are None or blank after stripping are dropped.
            clips.append(clip)
    return clips
def build_scene_stats(
    records_by_key: dict[str, list[dict[str, Any]]],
    normalize_scene: bool = True,
    dedupe_rawids: bool = False,
    dedupe_clips: bool = False,
) -> tuple[list[dict[str, Any]], dict[str, int]]:
    """Aggregate rawid and clip counts per scene.

    Records without a raw id are skipped. Each scene always tracks the set of
    rawids and clips it has seen, so the ``unique_*`` figures are available
    whichever counting mode is chosen.

    Args:
        records_by_key: Mapping of scenario key to its list of records.
        normalize_scene: Merge known scene-name aliases before grouping.
        dedupe_rawids: Count each rawid once per scene instead of once per
            record.
        dedupe_clips: Count each clip once per scene instead of summing the
            lengths of the clip lists.

    Returns:
        ``(stats, totals)``: ``stats`` holds one dict per scene, sorted by
        scene name; ``totals`` sums the per-scene counts and reports the
        number of distinct rawids and clips across all scenes.
    """
    grouped: OrderedDict[str, dict[str, Any]] = OrderedDict()
    for scenario_key, record in iter_records(records_by_key):
        raw_id = get_record_rawid(record)
        if not raw_id:
            continue
        scene_name = get_record_scene_name(scenario_key, record, normalize_scene=normalize_scene)
        clips = get_record_clips(record, raw_id)

        if scene_name not in grouped:
            grouped[scene_name] = {
                "scene": scene_name,
                "rawid_count": 0,
                "clip_count": 0,
                "_rawids": OrderedDict(),
                "_clips": OrderedDict(),
            }
        bucket = grouped[scene_name]

        # A rawid counts when it is new to the scene, or always if not deduping.
        seen_rawids = bucket["_rawids"]
        first_rawid_sighting = raw_id not in seen_rawids
        seen_rawids[raw_id] = None
        if first_rawid_sighting or not dedupe_rawids:
            bucket["rawid_count"] += 1

        # The same rule applies to every clip of the record.
        seen_clips = bucket["_clips"]
        for clip in clips:
            first_clip_sighting = clip not in seen_clips
            seen_clips[clip] = None
            if first_clip_sighting or not dedupe_clips:
                bucket["clip_count"] += 1

    stats = sorted(
        (
            {
                "scene": bucket["scene"],
                "rawid_count": bucket["rawid_count"],
                "unique_rawid_count": len(bucket["_rawids"]),
                "clip_count": bucket["clip_count"],
                "unique_clip_count": len(bucket["_clips"]),
            }
            for bucket in grouped.values()
        ),
        key=lambda row: row["scene"],
    )

    # Global uniqueness is computed across scenes, not by summing per-scene sets.
    unique_rawids: set[str] = set()
    unique_clips: set[str] = set()
    for bucket in grouped.values():
        unique_rawids.update(bucket["_rawids"])
        unique_clips.update(bucket["_clips"])

    totals = {
        "scene_count": len(stats),
        "rawid_count": sum(row["rawid_count"] for row in stats),
        "unique_rawid_count": len(unique_rawids),
        "clip_count": sum(row["clip_count"] for row in stats),
        "unique_clip_count": len(unique_clips),
    }
    return stats, totals
def render_markdown(
    input_path: Path,
    stats: list[dict[str, Any]],
    totals: dict[str, int],
    normalize_scene: bool,
    dedupe_rawids: bool,
    dedupe_clips: bool,
) -> str:
    """Render the scene statistics as a Markdown report.

    The report has a title, the source file, a section describing how the
    numbers were counted (driven by the three boolean flags), a summary table
    and a per-scene table that ends with a TOTAL row.

    Args:
        input_path: Source JSON path, shown verbatim in the report.
        stats: Per-scene rows; each needs "scene", "rawid_count" and
            "clip_count".
        totals: Needs "scene_count", "rawid_count" and "clip_count".
        normalize_scene: Whether scene-name aliases were merged.
        dedupe_rawids: Whether rawids were counted once per scene.
        dedupe_clips: Whether clips were counted once per scene.

    Returns:
        The Markdown text, terminated by a newline.
    """
    lines = [
        "# AEB Clips 场景统计",
        "",
        f"源文件:`{input_path}`",
        "",
        "## 统计口径",
        "",
        "- 按每条记录中的 `场景` 字段聚合统计。",
    ]
    if dedupe_rawids:
        lines.append("- `rawid数量` 为当前场景下去重后的 rawid 数。")
    else:
        lines.append("- `rawid数量` 为 rawid 记录数。")
    if dedupe_clips:
        lines.append("- `clip数量` 为当前场景下去重后的 clip 数。")
    else:
        lines.append("- `clip数量` 为各 rawid 记录中 `clips` 列表长度之和。")
    if normalize_scene:
        lines.extend(
            [
                "- 场景名大小写归并:",
                "  - `CBLA-cn21` 归并到 `CBLA-CN21`",
                "  - `CBLA-cn24` 归并到 `CBLA-CN24`",
            ]
        )
    else:
        lines.append("- 未做场景名归并。")
    lines.extend(
        [
            "",
            "## 汇总",
            "",
            "| 指标 | 数量 |",
            "|---|---:|",
            f"| 场景数 | {totals['scene_count']} |",
            f"| rawid数量 | {totals['rawid_count']} |",
            f"| clip数量 | {totals['clip_count']} |",
            "",
            "## 场景统计",
            "",
            "| 场景 | rawid数量 | clip数量 |",
            "|---|---:|---:|",
        ]
    )
    for item in stats:
        # Scene names come from the input data. A literal "|" would be parsed
        # as a column separator and corrupt the row, so escape it.
        scene_cell = str(item["scene"]).replace("|", "\\|")
        lines.append(f"| {scene_cell} | {item['rawid_count']} | {item['clip_count']} |")
    lines.append(f"| **TOTAL** | **{totals['rawid_count']}** | **{totals['clip_count']}** |")
    # Trailing empty element so the joined text ends with a newline.
    lines.append("")
    return "\n".join(lines)
def save_json_stats(output_path: Path, stats: list[dict[str, Any]], totals: dict[str, int]) -> None:
    """Write the statistics as UTF-8 JSON: ``{"summary": totals, "scenes": stats}``.

    Missing parent directories are created. Non-ASCII text is written as-is
    rather than escaped, with a two-space indent.
    """
    document = {"summary": totals, "scenes": stats}
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("w", encoding="utf-8") as handle:
        json.dump(document, handle, ensure_ascii=False, indent=2)
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the statistics script.

    Options: ``--input`` (defaults to ``DEFAULT_INPUT``), ``--output`` and
    ``--json-output`` (both optional paths), and the boolean switches
    ``--no-normalize-scene``, ``--dedupe-rawids`` and ``--dedupe-clips``.
    """
    parser = argparse.ArgumentParser(
        description="统计 aeb_clips*.json 中各场景的 rawid 数量和 clip 数量。"
    )
    parser.add_argument(
        "--input",
        default=str(DEFAULT_INPUT),
        help=f"输入 aeb_clips*.json默认: {DEFAULT_INPUT}",
    )

    # Optional output locations; both default to None.
    optional_paths = (
        ("--output", "输出 Markdown 路径;未指定时输出到输入文件同目录的 *_stats.md"),
        ("--json-output", "可选:同时输出机器可读 JSON 统计结果"),
    )
    for option, help_text in optional_paths:
        parser.add_argument(option, default=None, help=help_text)

    # Boolean switches; each is False unless given on the command line.
    switches = (
        ("--no-normalize-scene", "关闭历史场景名大小写归并"),
        ("--dedupe-rawids", "按场景统计去重 rawid 数,而不是 rawid 记录数"),
        ("--dedupe-clips", "按场景统计去重 clip 数,而不是 clips 列表长度之和"),
    )
    for option, help_text in switches:
        parser.add_argument(option, action="store_true", help=help_text)

    return parser.parse_args()
def main() -> None:
    """Run the CLI: load the input, compute statistics, write the reports.

    The Markdown report is always written, to ``--output`` or to the default
    path derived from the input. A JSON report is written only when
    ``--json-output`` is given. A short summary is printed at the end.
    """
    args = parse_args()
    input_path = Path(args.input)
    if args.output:
        output_path = Path(args.output)
    else:
        output_path = _default_output_path(input_path)

    normalize_scene = not args.no_normalize_scene
    stats, totals = build_scene_stats(
        load_scenario_records(input_path),
        normalize_scene=normalize_scene,
        dedupe_rawids=args.dedupe_rawids,
        dedupe_clips=args.dedupe_clips,
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    markdown = render_markdown(
        input_path=input_path,
        stats=stats,
        totals=totals,
        normalize_scene=normalize_scene,
        dedupe_rawids=args.dedupe_rawids,
        dedupe_clips=args.dedupe_clips,
    )
    output_path.write_text(markdown, encoding="utf-8")

    wants_json = bool(args.json_output)
    if wants_json:
        save_json_stats(Path(args.json_output), stats, totals)

    print(f"已保存 Markdown 统计: {output_path}")
    if wants_json:
        print(f"已保存 JSON 统计: {args.json_output}")
    print(
        f"场景数={totals['scene_count']} rawid数量={totals['rawid_count']} "
        f"clip数量={totals['clip_count']}"
    )
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()