#!/usr/bin/env python3
"""Convert a ``comparison_report.json`` into a Chinese Markdown evaluation report.

Usage:
    python generate_comparison_report.py <json_path>
    python generate_comparison_report.py <json_path> --output <output file path>
    python generate_comparison_report.py <json_path> --title "custom title"
    python generate_comparison_report.py <json_path> --background "background text"
    python generate_comparison_report.py <json_path> --date 2026-03-01

Example:
    python generate_comparison_report.py \
        evaluation_results/.../comparison_report.json \
        --output my_report.md \
        --title "ROI1 模型对比报告"

The two model names are read from the JSON itself (the keys of
``2d_metrics.overall.precision`` other than ``diff`` / ``relative_change_%``).
All ``diff`` / ``relative_change_%`` values are interpreted as
"model2 relative to model1", matching how ``judge`` reads them.
"""
import argparse
import json
import re
import sys
from datetime import date
from pathlib import Path
from typing import List, Optional, Tuple

# Allow importing class_config from the eval_tools root
sys.path.insert(0, str(Path(__file__).parent.parent))
from class_config import REPORT_3D_CLASS_LABELS  # noqa: E402

# ── Thresholds ───────────────────────────────────────────────────────────────
# An AP difference must exceed this absolute value to be reported as a win;
# anything smaller is reported as a tie.
AP_TIE_THRESHOLD = 0.005  # 0.5%
# Absolute tie threshold used for precision / recall / f1 / mAP rows.
METRIC_TIE_THRESHOLD = 0.005
# 3D errors whose relative change (in %) is below this value count as a tie.
ERROR_TIE_THRESHOLD_REL = 2.0

# Markdown only renders a hard line break when a line ends in two spaces.
# Keeping it as a named constant makes the intent explicit and protects it
# from editors / tools that collapse or strip trailing whitespace.
_MD_BREAK = "  "

_TIE_LABEL = "⚖️ 持平"

# A dash that directly follows a digit separates the two bounds of a range
# key such as "long_0-10m" (a leading "-" of a negative bound is not matched).
_RANGE_DASH_RE = re.compile(r"(?<=\d)-")

# (class name, model1 AP, model2 AP, relative AP change in %)
_AdvEntry = Tuple[str, float, float, float]


def fmt(v: float, decimals: int = 4) -> str:
    """Format *v* with a fixed number of decimals (4 by default)."""
    return f"{v:.{decimals}f}"


def fmt_pct(v: float) -> str:
    """Format *v* as a signed percentage with two decimals, e.g. ``+1.50%``."""
    sign = "+" if v >= 0 else ""
    return f"{sign}{v:.2f}%"


def fmt_diff(v: float) -> str:
    """Format *v* as a signed number with four decimals, e.g. ``-0.0123``."""
    sign = "+" if v >= 0 else ""
    return f"{sign}{v:.4f}"


def _fmt_count(v: Optional[int]) -> str:
    """Format a sample count with thousands separators, or ``N/A`` if missing."""
    return "N/A" if v is None else f"{v:,}"


def _short_tag(model_name: str) -> str:
    """Return the last ``-``-separated segment of a model name (table tag)."""
    return model_name.split("-")[-1]


def judge(diff: float, rel: float, higher_is_better: bool = True,
          abs_thr: float = AP_TIE_THRESHOLD, rel_thr: Optional[float] = None,
          model1_name: str = "model1", model2_name: str = "model2") -> str:
    """Decide which model wins a metric, given ``diff = model2 - model1``.

    Args:
        diff: Absolute difference (model2 minus model1).
        rel: Relative change in percent; only consulted when *rel_thr* is set.
        higher_is_better: ``True`` means ``diff > 0`` favours model2;
            ``False`` means ``diff < 0`` favours model2 (smaller error).
        abs_thr: Tie threshold on ``abs(diff)``, used when *rel_thr* is None.
        rel_thr: If given, tie threshold on ``abs(rel)`` instead of *abs_thr*.
        model1_name: Full name of model1; its short tag is shown on a win.
        model2_name: Full name of model2; its short tag is shown on a win.

    Returns:
        The tie label, or ``"✅ <tag>优"`` naming the better model.
    """
    if rel_thr is not None:
        tie = abs(rel) < rel_thr
    else:
        tie = abs(diff) < abs_thr
    if tie:
        return _TIE_LABEL

    m2_better = (diff > 0) if higher_is_better else (diff < 0)
    winner = _short_tag(model2_name if m2_better else model1_name)
    return f"✅ {winner}优"


def _header_lines(stats: dict, model1: str, model2: str, report_date: str,
                  title: Optional[str], background: Optional[str]) -> List[str]:
    """Build the title, metadata paragraph and background note."""
    lines = []
    auto_title = title or f"模型对比Overall指标总结 ({model1} vs {model2} - 通用数据集评测)"
    lines.append(f"# {auto_title}")
    lines.append("")
    lines.append(f"**对比模型**: {model1} vs {model2}" + _MD_BREAK)
    lines.append(f"**评测日期**: {report_date}" + _MD_BREAK)
    lines.append("**数据集**: 通用数据集 (Common Match Cases)" + _MD_BREAK)

    total_common = stats.get("common")
    m1_total = stats.get("model1_total")
    m2_total = stats.get("model2_total")
    if total_common is not None and m1_total is not None and m2_total is not None:
        lines.append(
            f"**匹配样本**: {total_common:,} ({model1}: {m1_total:,} | {model2}: {m2_total:,})"
        )
    else:
        lines.append("**匹配样本**: N/A")
    lines.append("")

    if background:
        lines.append(f"> **背景说明**: {background}")
    else:
        lines.append(
            f"> **背景说明**: 本次评测对比了 {model1} 与 {model2},评估两者在通用数据集上的2D/3D检测性能差异。"
        )
    lines.append("")
    lines.append("---")
    lines.append("")
    return lines


def _overall_2d_row(ov: dict, key: str, label: str, model1: str, model2: str) -> str:
    """Render one row of the 2D overall table (higher value is better)."""
    entry = ov[key]
    diff = entry["diff"]
    rel = entry["relative_change_%"]
    verdict = judge(diff, rel, True, abs_thr=METRIC_TIE_THRESHOLD,
                    model1_name=model1, model2_name=model2)
    return (
        f"| **{label}** | {fmt(entry[model1])} | {fmt(entry[model2])} "
        f"| {fmt_diff(diff)} | {fmt_pct(rel)} | {verdict} |"
    )


def _overall_2d_lines(ov: dict, summary: dict, model1: str, model2: str) -> List[str]:
    """Build the 2D overall table plus the "key findings" bullet list."""
    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)

    lines = [
        "## 📊 2D检测指标 (Overall)",
        "",
        "### 总体性能对比",
        "",
        f"| 指标 | {model1} | {model2} | 差异 | 相对变化 | 结果 |",
        "|------|" + "---|" * 5,
    ]
    for key, label in (
        ("precision", "PRECISION"),
        ("recall", "RECALL"),
        ("f1_score", "F1-Score"),
        ("map", "mAP"),
    ):
        lines.append(_overall_2d_row(ov, key, label, model1, model2))
    lines.append("")

    # Key findings are phrased from the relative changes (model2 vs model1).
    prec_rel = ov["precision"]["relative_change_%"]
    rec_rel = ov["recall"]["relative_change_%"]
    map_rel = ov["map"]["relative_change_%"]
    f1_rel = ov["f1_score"]["relative_change_%"]

    ap_summary = summary.get("2d", {}).get("ap", {})
    ap_wins = ap_summary.get("wins", "?")
    ap_losses = ap_summary.get("losses", "?")
    ap_ties = ap_summary.get("ties", "?")

    if prec_rel > 0:
        prec_verb, prec_note = "领先", "误检率略低"
    else:
        prec_verb, prec_note = "落后", "误检率略高"

    # The model named as leader is by construction the one with the higher
    # recall, so the trailing remark is always "higher detection rate".
    rec_leader = model1 if rec_rel < 0 else model2

    map_leader = model2 if map_rel > 0 else model1
    map_note = "极小差异,基本持平" if abs(map_rel) < 2 else "有一定差距"

    if abs(f1_rel) < 1:
        f1_note = "两模型基本持平"
    else:
        f1_note = (model2 if f1_rel > 0 else model1) + "更优"

    lines.append("### 关键发现")
    lines.append("")
    lines.append(
        f"- 📊 **Precision**: {model2}{prec_verb}{fmt_pct(abs(prec_rel))},{prec_note}"
    )
    lines.append(
        f"- 📊 **Recall**: {rec_leader}领先{fmt_pct(abs(rec_rel))},检出率更高"
    )
    lines.append(
        f"- 📊 **mAP**: {map_leader}领先{fmt_pct(abs(map_rel))}({map_note})"
    )
    lines.append(
        f"- 📊 **F1-Score**: {f1_note}(差距{fmt_pct(abs(f1_rel))})"
    )
    lines.append(
        f"- ⚖️ **类别赢负统计 (AP)**: {m2_tag}赢{ap_wins}类, {m1_tag}赢{ap_losses}类, 平局{ap_ties}类"
    )
    lines.append("")
    lines.append("---")
    lines.append("")
    return lines


def _per_class_2d_lines(pc: dict, model1: str, model2: str
                        ) -> Tuple[List[str], List[_AdvEntry], List[_AdvEntry]]:
    """Build the 2D per-class table and the per-class advantage analysis.

    Returns:
        ``(lines, adv_m2, adv_m1)`` where ``adv_m2`` / ``adv_m1`` list the
        classes whose AP relative change is at least 2% in favour of
        model2 / model1, in the order they appear in *pc*.
    """
    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)

    lines = [
        "## 📋 2D检测指标 (Per Class)",
        "",
        "### 各类别性能对比",
        "",
        (
            f"| 类别 | Precision ({m1_tag}) | Precision ({m2_tag}) "
            f"| Recall ({m1_tag}) | Recall ({m2_tag}) "
            f"| F1 ({m1_tag}) | F1 ({m2_tag}) "
            f"| AP ({m1_tag}) | AP ({m2_tag}) | AP差异 | 结果 |"
        ),
        "|------|" + "---|" * 10,
    ]

    adv_m2: List[_AdvEntry] = []  # classes where model2 is clearly better
    adv_m1: List[_AdvEntry] = []  # classes where model1 is clearly better

    for cls, cd in pc.items():
        ap1 = cd["ap"][model1]
        ap2 = cd["ap"][model2]
        ap_d = cd["ap"]["diff"]
        ap_r = cd["ap"]["relative_change_%"]
        verdict = judge(ap_d, ap_r, True, abs_thr=AP_TIE_THRESHOLD,
                        model1_name=model1, model2_name=model2)
        lines.append(
            f"| **{cls}** | {fmt(cd['precision'][model1])} | {fmt(cd['precision'][model2])} "
            f"| {fmt(cd['recall'][model1])} | {fmt(cd['recall'][model2])} "
            f"| {fmt(cd['f1_score'][model1])} | {fmt(cd['f1_score'][model2])} "
            f"| {fmt(ap1)} | {fmt(ap2)} | {fmt_diff(ap_d)} | {verdict} |"
        )
        # Only a relative change of at least 2% counts as significant.
        if abs(ap_r) >= 2.0:
            if ap_d > 0:
                adv_m2.append((cls, ap1, ap2, ap_r))
            elif ap_d < 0:
                adv_m1.append((cls, ap1, ap2, ap_r))

    lines.append("")
    lines.append("### 类别分析")
    lines.append("")

    if adv_m2:
        lines.append(f"**{model2} 优势类别** (AP更高):")
        # Largest positive relative change first.
        for cls, ap1, ap2, rel in sorted(adv_m2, key=lambda e: -e[3]):
            mark = "**大幅领先**" if rel > 8 else "领先"
            lines.append(
                f"- {cls}: {m2_tag} {fmt(ap2)} > {m1_tag} {fmt(ap1)}({mark}{fmt_pct(rel)})"
            )
        lines.append("")

    if adv_m1:
        lines.append(f"**{model1} 优势类别** (AP更高):")
        # Most negative relative change (biggest model1 lead) first.
        for cls, ap1, ap2, rel in sorted(adv_m1, key=lambda e: e[3]):
            mark = "**大幅领先**" if abs(rel) > 8 else "领先"
            lines.append(
                f"- {cls}: {m1_tag} {fmt(ap1)} > {m2_tag} {fmt(ap2)}({mark}{fmt_pct(abs(rel))})"
            )
        lines.append("")

    lines.append("---")
    lines.append("")
    return lines, adv_m2, adv_m1


def _row_3d(ov3: dict, key: str, label: str, model1: str, model2: str) -> Optional[str]:
    """Render one 3D error row (lower is better), or None if *key* is absent."""
    if key not in ov3:
        return None
    entry = ov3[key]
    diff = entry["diff"]
    rel = entry["relative_change_%"]
    verdict = judge(diff, rel, False, rel_thr=ERROR_TIE_THRESHOLD_REL,
                    model1_name=model1, model2_name=model2)
    return (
        f"| **{label}** | {fmt(entry[model1]['mean'])} | {fmt(entry[model2]['mean'])} "
        f"| {fmt_diff(diff)} | {fmt_pct(rel)} | {verdict} |"
    )


def _long_sort_key(k: str) -> float:
    """Sort key for ``long_*`` range keys: the numeric lower bound of the range.

    Keys whose lower bound cannot be parsed sort last (``inf``).
    """
    stripped = k[len("long_"):].replace("m", "")
    m = _RANGE_DASH_RE.search(stripped)
    if m:
        try:
            return float(stripped[:m.start()])
        except ValueError:
            pass
    return float("inf")


def _range_value(rb: dict, metric: str, model: str) -> str:
    """Return the formatted mean of *metric* for *model* in a range bucket."""
    d = rb.get(metric, {})
    if model in d:
        return fmt(d[model]["mean"])
    return "-"


def _range_delta(rb: dict, metric: str) -> str:
    """Return the formatted relative change of *metric* in a range bucket."""
    rel = rb.get(metric, {}).get("relative_change_%")
    if rel is None:
        return "-"
    return fmt_pct(rel)


def _range_table_lines(cd: dict, model1: str, model2: str) -> List[str]:
    """Build the longitudinal-range comparison table for one 3D class.

    Returns an empty list when the class has no ``long_*`` entries.
    """
    long_keys = sorted((k for k in cd if k.startswith("long_")), key=_long_sort_key)
    if not long_keys:
        return []

    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)
    lines = [
        "#### 纵向区间对比",
        "",
        (
            f"| 区间 | 样本数 "
            f"| Lat ({m1_tag}) | Lat ({m2_tag}) | Lat Δ% "
            f"| Long ({m1_tag}) | Long ({m2_tag}) | Long Δ% "
            f"| LongRel ({m1_tag}) | LongRel ({m2_tag}) | LongRel Δ% "
            f"| Head ({m1_tag}) | Head ({m2_tag}) | Head Δ% |"
        ),
        "|------|" + "---|" * 13,
    ]

    metrics = (
        "lateral_error",
        "longitudinal_error",
        "longitudinal_relative_error",
        "heading_error",
    )
    for rk in long_keys:
        rb = cd[rk]
        if not rb:
            continue

        # Sample count: taken from model1's entry of the first metric that has one.
        n_range = "-"
        for metric in ("lateral_error", "longitudinal_error", "heading_error"):
            md = rb.get(metric, {})
            if model1 in md and "samples" in md[model1]:
                n_range = f"{md[model1]['samples']:,}"
                break

        # Range label: the key without its "long_" prefix.
        cells = [f"| **{rk[len('long_'):]}** | {n_range} "]
        for metric in metrics:
            cells.append(
                f"| {_range_value(rb, metric, model1)} "
                f"| {_range_value(rb, metric, model2)} "
                f"| {_range_delta(rb, metric)} "
            )
        lines.append("".join(cells) + "|")

    lines.append("")
    return lines


def _metrics_3d_lines(m3d: dict, model1: str, model2: str) -> List[str]:
    """Build the 3D section: one table (plus range table) per known class.

    Classes are emitted in the order of ``REPORT_3D_CLASS_LABELS``; classes
    missing from *m3d* or without an ``overall`` entry are skipped.
    """
    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)
    lines = ["## 🎯 3D检测指标", ""]

    for cls_key, cls_label in REPORT_3D_CLASS_LABELS.items():
        if cls_key not in m3d:
            continue
        cd = m3d[cls_key]
        ov3 = cd.get("overall", {})
        if not ov3:
            continue

        n = cd.get("common_samples")
        n_str = f"{n:,} 个样本" if n is not None else "N/A 个样本"
        lines.append(f"### {cls_label} - {n_str}")
        lines.append("")
        lines.append(f"| 指标 | {model1} | {model2} | 差异 | 相对变化 | 结果 |")
        lines.append("|------|" + "---|" * 5)

        for key, label in (
            ("lateral_error", "Lateral Error"),
            ("longitudinal_error", "Longitudinal Error"),
            ("longitudinal_relative_error", "Longitudinal Relative Error"),
            ("heading_error", "Heading Error"),
            ("heading_error_relaxed", "Heading Error Relaxed"),
        ):
            row = _row_3d(ov3, key, label, model1, model2)
            if row is not None:
                lines.append(row)

        if "reversal_info" in ov3:
            rev1 = ov3["reversal_info"][model1]
            rev2 = ov3["reversal_info"][model2]
            # Fewer reversals is better; under 0.5 percentage points is a tie.
            if abs(rev1["percentage"] - rev2["percentage"]) < 0.5:
                rev_j = _TIE_LABEL
            else:
                better = m2_tag if rev2["percentage"] < rev1["percentage"] else m1_tag
                rev_j = "✅ " + better + "优"
            lines.append(
                f"| **Reversal Cases** | {rev1['count']:,} ({rev1['percentage']:.2f}%) "
                f"| {rev2['count']:,} ({rev2['percentage']:.2f}%) | - | - | {rev_j} |"
            )
        lines.append("")

        lines.extend(_range_table_lines(cd, model1, model2))

    lines.append("---")
    lines.append("")
    return lines


def _match_stats_lines(stats: dict, model1: str, model2: str) -> List[str]:
    """Build the sample-matching statistics section.

    Missing totals are rendered as ``N/A`` instead of raising, mirroring the
    guard used for the same values in the report header.
    """
    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)

    common = _fmt_count(stats.get("common"))
    m1_total = _fmt_count(stats.get("model1_total"))
    m2_total = _fmt_count(stats.get("model2_total"))
    m1_pct = stats.get("common_percentage_of_model1", 0)
    m2_pct = stats.get("common_percentage_of_model2", 0)
    m1_uniq = stats.get("model1_unique", 0)
    m2_uniq = stats.get("model2_unique", 0)

    lines = [
        "## 📊 样本匹配统计",
        "",
        "### 整体匹配情况",
        "",
        "| 模型 | 总样本数 | 公共样本 | 独有样本 | 公共占比 |",
        "|------|----------|----------|----------|---------|",
        f"| **{model1}** | {m1_total} | {common} | {m1_uniq:,} | {m1_pct:.2f}% |",
        f"| **{model2}** | {m2_total} | {common} | {m2_uniq:,} | {m2_pct:.2f}% |",
        "",
    ]

    per_cls_stats = stats.get("per_class", {})
    if per_cls_stats:
        lines.append("### 各类别匹配情况 (3D)")
        lines.append("")
        lines.append(
            f"| 类别 | {m1_tag}总数 | {m2_tag}总数 | 公共样本 | {m1_tag}占比 | {m2_tag}占比 |"
        )
        lines.append("|------|" + "---|" * 5)
        for cls, cs in per_cls_stats.items():
            lines.append(
                f"| **{cls}** | {cs['model1_total']:,} | {cs['model2_total']:,} "
                f"| {cs['common']:,} | {cs['common_percentage_of_model1']:.2f}% "
                f"| {cs['common_percentage_of_model2']:.2f}% |"
            )
        lines.append("")

    lines.append("---")
    lines.append("")
    return lines


def _conclusion_lines(ov: dict, summary: dict, has_3d: bool,
                      adv_m2: List[_AdvEntry], adv_m1: List[_AdvEntry],
                      model1: str, model2: str) -> List[str]:
    """Build the final "conclusions and recommendations" section."""
    m1_tag = _short_tag(model1)
    m2_tag = _short_tag(model2)

    sum2d = summary.get("2d", {})
    ap = sum2d.get("ap", {})
    f1 = sum2d.get("f1_score", {})

    lines = [
        "## 🎯 结论与建议",
        "",
        "### 2D检测汇总",
        "",
        (
            f"- **AP 类别统计**: {m2_tag}赢{ap.get('wins', 0)}类 "
            f"/ {m1_tag}赢{ap.get('losses', 0)}类 / 平局{ap.get('ties', 0)}类"
        ),
        (
            f"- **F1 类别统计**: {m2_tag}赢{f1.get('wins', 0)}类 "
            f"/ {m1_tag}赢{f1.get('losses', 0)}类 / 平局{f1.get('ties', 0)}类"
        ),
        (
            f"- **整体mAP**: {model1}={fmt(ov['map'][model1])} vs "
            f"{model2}={fmt(ov['map'][model2])} ({fmt_pct(ov['map']['relative_change_%'])})"
        ),
        "",
    ]

    if has_3d:
        sum3d = summary.get("3d", {})
        lines.append("### 3D检测汇总")
        lines.append("")
        for key, label in (
            ("lateral", "横向误差 (Lateral)"),
            ("longitudinal", "纵向误差 (Longitudinal)"),
            ("heading", "航向误差 (Heading)"),
        ):
            counts = sum3d.get(key, {})
            lines.append(
                f"- **{label}**: {m2_tag}优{counts.get('wins', 0)}类 "
                f"/ {m1_tag}优{counts.get('losses', 0)}类"
            )
        lines.append("")

    lines.append("### 综合建议")
    lines.append("")

    # Overall winner: decided by mAP relative change, with a ±2% dead band.
    map_rel = ov["map"]["relative_change_%"]
    if map_rel > 2:
        overall_winner = model2
    elif map_rel < -2:
        overall_winner = model1
    else:
        overall_winner = None

    if overall_winner:
        lines.append(f"- 🏆 **综合mAP**: {overall_winner} 整体占优({fmt_pct(abs(map_rel))})")
    else:
        lines.append(f"- ⚖️ **综合mAP**: 两模型基本持平(差距{fmt_pct(abs(map_rel))})")

    if adv_m2:
        cls_str = "、".join(entry[0] for entry in adv_m2)
        lines.append(f"- ✅ **{model2} 改善**: {cls_str} 类别AP有所提升")
    if adv_m1:
        cls_str = "、".join(entry[0] for entry in adv_m1)
        lines.append(f"- ⚠️ **{model2} 退化**: {cls_str} 类别AP有所下降")

    lines.append("")
    return lines


def build_report(data: dict, model1: str, model2: str,
                 report_date: str, title: str = None, background: str = None) -> str:
    """Generate the complete Markdown report as a single string.

    Args:
        data: Parsed ``comparison_report.json``. ``data["2d_metrics"]`` (with
            an ``overall`` entry) is required; ``3d_metrics``,
            ``match_statistics`` and ``summary`` are optional.
        model1: Name of the baseline model, as used for keys inside *data*.
        model2: Name of the compared model, as used for keys inside *data*.
        report_date: Evaluation date shown in the header.
        title: Optional custom title; a default one is generated if falsy.
        background: Optional background note; a default one is used if falsy.

    Returns:
        The Markdown text, lines joined with ``"\\n"``.

    Raises:
        KeyError: If a required metric key is missing from *data*.
    """
    m2d = data["2d_metrics"]
    m3d = data.get("3d_metrics", {})
    stats = data.get("match_statistics", {})
    summary = data.get("summary", {})
    ov = m2d["overall"]

    lines = _header_lines(stats, model1, model2, report_date, title, background)
    lines.extend(_overall_2d_lines(ov, summary, model1, model2))

    per_class_lines, adv_m2, adv_m1 = _per_class_2d_lines(
        m2d.get("per_class", {}), model1, model2
    )
    lines.extend(per_class_lines)

    if m3d:
        lines.extend(_metrics_3d_lines(m3d, model1, model2))
    if stats:
        lines.extend(_match_stats_lines(stats, model1, model2))

    lines.extend(
        _conclusion_lines(ov, summary, bool(m3d), adv_m2, adv_m1, model1, model2)
    )
    return "\n".join(lines)


def main():
    """CLI entry point: read the JSON, build the report, write the Markdown."""
    parser = argparse.ArgumentParser(
        description="将 comparison_report.json 转换为中文 Markdown 评测报告"
    )
    parser.add_argument("json_path", help="comparison_report.json 的路径")
    parser.add_argument("--output", "-o", default=None,
                        help="输出 Markdown 文件路径(默认与 JSON 同目录,文件名 COMPARISON_REPORT.md)")
    parser.add_argument("--title", default=None, help="自定义报告标题")
    parser.add_argument("--background", default=None, help="背景说明文字")
    parser.add_argument("--date", default=str(date.today()),
                        help="评测日期 (默认今天,格式 YYYY-MM-DD)")
    args = parser.parse_args()

    json_path = Path(args.json_path)
    if not json_path.exists():
        print(f"错误: 文件不存在: {json_path}", file=sys.stderr)
        sys.exit(1)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Model names are the keys of the overall precision entry, minus the
    # derived "diff" / "relative_change_%" keys. A file that lacks that entry
    # altogether is reported with the same friendly message instead of a
    # raw KeyError traceback.
    skip = {"diff", "relative_change_%"}
    try:
        precision_entry = data["2d_metrics"]["overall"]["precision"]
        models = [m for m in precision_entry if m not in skip]
    except (KeyError, TypeError):
        models = []

    if len(models) < 2:
        print("错误: 无法从 JSON 中自动识别模型名称,请检查文件格式。", file=sys.stderr)
        sys.exit(1)

    model1, model2 = models[0], models[1]
    print(f"模型1: {model1}")
    print(f"模型2: {model2}")

    report = build_report(data, model1, model2,
                          report_date=args.date,
                          title=args.title,
                          background=args.background)

    # Default output location: next to the input JSON.
    if args.output:
        out_path = Path(args.output)
    else:
        out_path = json_path.parent / "COMPARISON_REPORT.md"

    out_path.write_text(report, encoding="utf-8")
    print(f"报告已生成: {out_path}")


if __name__ == "__main__":
    main()