Files
yolov26_3d/eval_tools/model_comparison/generate_comparison_report.py
2026-06-24 09:35:46 +08:00

497 lines
22 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Convert a comparison_report.json file into a Chinese-language Markdown evaluation report.

Usage:
    python generate_comparison_report.py <path to comparison_report.json>
    python generate_comparison_report.py <path to comparison_report.json> --output <output file path>
    python generate_comparison_report.py <path to comparison_report.json> --title "custom title"
    python generate_comparison_report.py <path to comparison_report.json> --background "background notes"
    python generate_comparison_report.py <path to comparison_report.json> --date 2026-03-01

Examples:
    python generate_comparison_report.py \
        evaluation_results/eval_results_common_match_comparison_cncap_yolov5s_20260228_roi0/comparison_common_matches_20260228_102849/comparison_report.json

    python generate_comparison_report.py \
        evaluation_results/.../comparison_report.json \
        --output my_report.md \
        --title "ROI1 模型对比报告"
"""
import json
import re
import argparse
import sys
from datetime import date
from pathlib import Path
# Allow importing class_config from the eval_tools root
sys.path.insert(0, str(Path(__file__).parent.parent))
from class_config import REPORT_3D_CLASS_LABELS
# ── Tie thresholds ───────────────────────────────────────────────────────────
# An absolute AP difference smaller than this is reported as a tie; only a
# larger difference names a winning model.
AP_TIE_THRESHOLD = 0.005  # 0.005 absolute, i.e. 0.5 percentage points
METRIC_TIE_THRESHOLD = 0.005  # absolute tie threshold for precision / recall / F1 / mAP rows
ERROR_TIE_THRESHOLD_REL = 2.0  # a 3D-error relative change (%) smaller than this is a tie
def fmt(v: float, decimals: int = 4) -> str:
    """Format a number with a fixed count of decimal places.

    Args:
        v: Value to format.
        decimals: Number of digits after the decimal point (default 4).

    Returns:
        The fixed-point string, e.g. ``fmt(0.5)`` -> ``"0.5000"``.
    """
    return f"{v:.{decimals}f}"
def fmt_pct(v: float) -> str:
    """Format a percentage value with an explicit sign and two decimals.

    Args:
        v: Percentage value (already expressed in percent, not a fraction).

    Returns:
        A string such as ``"+1.50%"`` or ``"-2.50%"``.
    """
    # Adding 0.0 turns IEEE negative zero into positive zero; a hand-written
    # sign prefix would otherwise produce "+-0.00%" because -0.0 >= 0 is True.
    return f"{v + 0.0:+.2f}%"
def fmt_diff(v: float) -> str:
    """Format a signed difference with an explicit sign and four decimals.

    Args:
        v: Difference value.

    Returns:
        A string such as ``"+0.0123"`` or ``"-0.5000"``.
    """
    # Adding 0.0 turns IEEE negative zero into positive zero; a hand-written
    # sign prefix would otherwise produce "+-0.0000" because -0.0 >= 0 is True.
    return f"{v + 0.0:+.4f}"
def judge(diff: float, rel: float, higher_is_better: bool = True,
          abs_thr: float = AP_TIE_THRESHOLD, rel_thr: "float | None" = None,
          model1_name: str = "model1", model2_name: str = "model2") -> str:
    """Decide which model wins a metric, or report a tie.

    Args:
        diff: Metric difference computed as ``model2 - model1``.
        rel: Relative change in percent; only consulted when ``rel_thr`` is set.
        higher_is_better: True when a larger metric is better (``diff > 0``
            favours model2); False for error metrics (``diff < 0`` favours
            model2).
        abs_thr: Tie threshold applied to ``abs(diff)`` when ``rel_thr`` is None.
        rel_thr: When given, the tie test uses ``abs(rel) < rel_thr`` instead
            of the absolute threshold.
        model1_name: Full name of model1.
        model2_name: Full name of model2.

    Returns:
        ``"⚖️ 持平"`` for a tie, otherwise the last ``-``-separated segment of
        the winning model's name.
    """
    if rel_thr is not None:
        is_tie = abs(rel) < rel_thr
    else:
        is_tie = abs(diff) < abs_thr
    if is_tie:
        return "⚖️ 持平"

    # A diff of exactly zero that is not a tie (threshold 0) falls to model1.
    model2_wins = (diff > 0) if higher_is_better else (diff < 0)
    winner = model2_name if model2_wins else model1_name
    return winner.split("-")[-1]
# Two trailing spaces are Markdown's hard line break; without them the header
# metadata fields collapse into a single rendered paragraph.
_MD_HARD_BREAK = "  "

# (JSON key, row label) for the per-class 3D "overall" table, in display order.
_OVERALL_3D_METRICS = (
    ("lateral_error", "Lateral Error"),
    ("longitudinal_error", "Longitudinal Error"),
    ("longitudinal_relative_error", "Longitudinal Relative Error"),
    ("heading_error", "Heading Error"),
    ("heading_error_relaxed", "Heading Error Relaxed"),
)

# JSON keys shown in the longitudinal-range table, in column-group order.
_RANGE_3D_METRICS = (
    "lateral_error",
    "longitudinal_error",
    "longitudinal_relative_error",
    "heading_error",
)


def _short_tag(model_name: str) -> str:
    """Return the last "-"-separated segment of a model name (compact table label)."""
    return model_name.split("-")[-1]


def _fmt_count(n) -> str:
    """Format a count with thousands separators, or "N/A" when it is missing."""
    return f"{n:,}" if n is not None else "N/A"


def _long_range_sort_key(key: str) -> float:
    """Sort key for "long_*" range keys: the numeric lower bound, or +inf if unparsable."""
    stripped = key[len("long_"):].replace("m", "")
    # The range separator is the first hyphen preceded by a digit, so a
    # leading minus sign on the lower bound is not mistaken for it.
    sep = re.search(r"(?<=\d)-", stripped)
    if sep:
        try:
            return float(stripped[:sep.start()])
        except ValueError:
            pass
    return float("inf")


def _header_lines(stats: dict, model1: str, model2: str, report_date: str,
                  title, background) -> list:
    """Build the title, metadata and background block."""
    heading = title or f"模型对比Overall指标总结 ({model1} vs {model2} - 通用数据集评测)"
    out = [
        f"# {heading}",
        "",
        f"**对比模型**: {model1} vs {model2}{_MD_HARD_BREAK}",
        f"**评测日期**: {report_date}{_MD_HARD_BREAK}",
        f"**数据集**: 通用数据集 (Common Match Cases){_MD_HARD_BREAK}",
    ]
    common = stats.get("common")
    m1_total = stats.get("model1_total")
    m2_total = stats.get("model2_total")
    if common is None or m1_total is None or m2_total is None:
        out.append("**匹配样本**: N/A")
    else:
        out.append(
            f"**匹配样本**: {common:,} ({model1}: {m1_total:,} | {model2}: {m2_total:,})"
        )
    out.append("")
    if background:
        out.append(f"> **背景说明**: {background}")
    else:
        out.append(
            f"> **背景说明**: 本次评测对比了 {model1} 与 {model2},"
            "评估两者在通用数据集上的2D/3D检测性能差异。"
        )
    out += ["", "---", ""]
    return out


def _overall_2d_lines(ov: dict, summary: dict, model1: str, model2: str) -> list:
    """Build the 2D overall table and the key-findings bullet list."""
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)

    def row(key: str, label: str) -> str:
        entry = ov[key]
        diff = entry["diff"]
        rel = entry["relative_change_%"]
        verdict = judge(diff, rel, True, abs_thr=METRIC_TIE_THRESHOLD,
                        model1_name=model1, model2_name=model2)
        return (f"| **{label}** | {fmt(entry[model1])} | {fmt(entry[model2])} "
                f"| {fmt_diff(diff)} | {fmt_pct(rel)} | {verdict} |")

    out = [
        "## 📊 2D检测指标 (Overall)",
        "",
        "### 总体性能对比",
        "",
        f"| 指标 | {model1} | {model2} | 差异 | 相对变化 | 结果 |",
        "|------|" + "---|" * 5,
        row("precision", "PRECISION"),
        row("recall", "RECALL"),
        row("f1_score", "F1-Score"),
        row("map", "mAP"),
        "",
    ]

    prec_rel = ov["precision"]["relative_change_%"]
    rec_rel = ov["recall"]["relative_change_%"]
    map_rel = ov["map"]["relative_change_%"]
    f1_rel = ov["f1_score"]["relative_change_%"]
    ap_summary = summary.get("2d", {}).get("ap", {})
    ap_wins = ap_summary.get("wins", "?")
    ap_losses = ap_summary.get("losses", "?")
    ap_ties = ap_summary.get("ties", "?")

    prec_word = "领先" if prec_rel > 0 else "落后"
    prec_note = "误检率略低" if prec_rel > 0 else "误检率略高"
    # Whichever model leads on recall is by definition the one with the
    # higher detection rate, so the remark does not depend on the sign.
    recall_leader = model1 if rec_rel < 0 else model2
    map_leader = model2 if map_rel > 0 else model1
    map_note = "极小差异,基本持平" if abs(map_rel) < 2 else "有一定差距"
    if abs(f1_rel) < 1:
        f1_note = "两模型基本持平"
    else:
        f1_note = (model2 if f1_rel > 0 else model1) + "更优"

    out += [
        "### 关键发现",
        "",
        f"- 📊 **Precision**: {model2}{prec_word}{fmt_pct(abs(prec_rel))},{prec_note}",
        f"- 📊 **Recall**: {recall_leader}领先{fmt_pct(abs(rec_rel))},检出率更高",
        f"- 📊 **mAP**: {map_leader}领先{fmt_pct(abs(map_rel))},{map_note}",
        f"- 📊 **F1-Score**: {f1_note}(差距{fmt_pct(abs(f1_rel))})",
        f"- ⚖️ **类别赢负统计 (AP)**: {m2_tag} 赢 {ap_wins} 类, "
        f"{m1_tag} 赢 {ap_losses} 类, 平局 {ap_ties} 类",
        "",
        "---",
        "",
    ]
    return out


def _per_class_2d_lines(per_class: dict, model1: str, model2: str) -> tuple:
    """Build the 2D per-class table and analysis.

    Returns:
        ``(lines, adv_m2, adv_m1)`` where the ``adv_*`` lists hold
        ``(class, ap_model1, ap_model2, relative_change)`` tuples for classes
        whose AP relative change is at least 2% in that model's favour.
    """
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)
    out = [
        "## 📋 2D检测指标 (Per Class)",
        "",
        "### 各类别性能对比",
        "",
        f"| 类别 | Precision ({m1_tag}) | Precision ({m2_tag}) | Recall ({m1_tag}) | Recall ({m2_tag}) | F1 ({m1_tag}) | F1 ({m2_tag}) | AP ({m1_tag}) | AP ({m2_tag}) | AP差异 | 结果 |",
        "|------|" + "---|" * 10,
    ]
    adv_m2 = []  # classes where model2 is clearly better
    adv_m1 = []  # classes where model1 is clearly better
    for cls, cd in per_class.items():
        prec, rec, f1, ap = cd["precision"], cd["recall"], cd["f1_score"], cd["ap"]
        ap1, ap2 = ap[model1], ap[model2]
        ap_diff = ap["diff"]
        ap_rel = ap["relative_change_%"]
        verdict = judge(ap_diff, ap_rel, True, abs_thr=AP_TIE_THRESHOLD,
                        model1_name=model1, model2_name=model2)
        out.append(
            f"| **{cls}** | {fmt(prec[model1])} | {fmt(prec[model2])} "
            f"| {fmt(rec[model1])} | {fmt(rec[model2])} "
            f"| {fmt(f1[model1])} | {fmt(f1[model2])} "
            f"| {fmt(ap1)} | {fmt(ap2)} | {fmt_diff(ap_diff)} | {verdict} |"
        )
        if abs(ap_rel) >= 2.0:  # only a relative change of >= 2% counts as significant
            if ap_diff > 0:
                adv_m2.append((cls, ap1, ap2, ap_rel))
            elif ap_diff < 0:
                adv_m1.append((cls, ap1, ap2, ap_rel))

    out += ["", "### 类别分析", ""]
    if adv_m2:
        out.append(f"**{model2} 优势类别** (AP更高):")
        for cls, ap1, ap2, rel in sorted(adv_m2, key=lambda item: -item[3]):
            mark = "**大幅领先**" if rel > 8 else "领先"
            out.append(
                f"- {cls}: {m2_tag} {fmt(ap2)} > {m1_tag} {fmt(ap1)},{mark} {fmt_pct(rel)}"
            )
        out.append("")
    if adv_m1:
        out.append(f"**{model1} 优势类别** (AP更高):")
        for cls, ap1, ap2, rel in sorted(adv_m1, key=lambda item: item[3]):
            mark = "**大幅领先**" if abs(rel) > 8 else "领先"
            out.append(
                f"- {cls}: {m1_tag} {fmt(ap1)} > {m2_tag} {fmt(ap2)},{mark} {fmt_pct(abs(rel))}"
            )
        out.append("")
    out += ["---", ""]
    return out, adv_m2, adv_m1


def _range_table_lines(cd: dict, range_keys: list, model1: str, model2: str) -> list:
    """Build the longitudinal-range comparison table for one 3D class."""
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)
    out = [
        "#### 纵向区间对比",
        "",
        f"| 区间 | 样本数 "
        f"| Lat ({m1_tag}) | Lat ({m2_tag}) | Lat Δ% "
        f"| Long ({m1_tag}) | Long ({m2_tag}) | Long Δ% "
        f"| LongRel ({m1_tag}) | LongRel ({m2_tag}) | LongRel Δ% "
        f"| Head ({m1_tag}) | Head ({m2_tag}) | Head Δ% |",
        "|------|" + "---|" * 13,
    ]
    for range_key in range_keys:
        bucket = cd[range_key]
        if not bucket:
            continue

        # Sample count: taken from the first metric that reports one for model1.
        sample_count = "-"
        for metric in ("lateral_error", "longitudinal_error", "heading_error"):
            m1_entry = bucket.get(metric, {}).get(model1)
            if m1_entry is not None and "samples" in m1_entry:
                sample_count = f"{m1_entry['samples']:,}"
                break

        # Range label: the key with its "long_" prefix removed.
        label = range_key[len("long_"):]
        cells = [f"**{label}**", sample_count]
        for metric in _RANGE_3D_METRICS:
            entry = bucket.get(metric, {})
            for model in (model1, model2):
                cells.append(fmt(entry[model]["mean"]) if model in entry else "-")
            rel = entry.get("relative_change_%")
            cells.append("-" if rel is None else fmt_pct(rel))
        out.append("| " + " | ".join(cells) + " |")
    return out


def _metrics_3d_lines(m3d: dict, model1: str, model2: str) -> list:
    """Build the 3D section: one overall table (plus optional range table) per class."""
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)
    out = ["## 🎯 3D检测指标", ""]
    for cls_key, cls_label in REPORT_3D_CLASS_LABELS.items():
        cd = m3d.get(cls_key) or {}
        ov3 = cd.get("overall")
        if not ov3:
            continue

        out += [
            f"### {cls_label} - {_fmt_count(cd.get('common_samples'))} 个样本",
            "",
            f"| 指标 | {model1} | {model2} | 差异 | 相对变化 | 结果 |",
            "|------|" + "---|" * 5,
        ]
        for key, label in _OVERALL_3D_METRICS:
            if key not in ov3:
                continue
            entry = ov3[key]
            diff = entry["diff"]
            rel = entry["relative_change_%"]
            # These are error metrics: lower is better, ties judged on relative change.
            verdict = judge(diff, rel, False, rel_thr=ERROR_TIE_THRESHOLD_REL,
                            model1_name=model1, model2_name=model2)
            out.append(
                f"| **{label}** | {fmt(entry[model1]['mean'])} | {fmt(entry[model2]['mean'])} "
                f"| {fmt_diff(diff)} | {fmt_pct(rel)} | {verdict} |"
            )

        if "reversal_info" in ov3:
            rev1 = ov3["reversal_info"][model1]
            rev2 = ov3["reversal_info"][model2]
            if abs(rev1["percentage"] - rev2["percentage"]) < 0.5:
                rev_verdict = "⚖️ 持平"
            else:
                # Fewer reversal cases is better.
                rev_verdict = m2_tag if rev2["percentage"] < rev1["percentage"] else m1_tag
            out.append(
                f"| **Reversal Cases** | {rev1['count']:,} ({rev1['percentage']:.2f}%) "
                f"| {rev2['count']:,} ({rev2['percentage']:.2f}%) | - | - | {rev_verdict} |"
            )
        out.append("")

        range_keys = sorted(
            (k for k in cd if k.startswith("long_")),
            key=_long_range_sort_key,
        )
        if range_keys:
            out += _range_table_lines(cd, range_keys, model1, model2)
            out.append("")
    out += ["---", ""]
    return out


def _match_stats_lines(stats: dict, model1: str, model2: str) -> list:
    """Build the sample-matching statistics section."""
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)
    # Totals may be absent from the JSON; render "N/A" rather than crash on
    # formatting None with a thousands separator.
    common = _fmt_count(stats.get("common"))
    m1_total = _fmt_count(stats.get("model1_total"))
    m2_total = _fmt_count(stats.get("model2_total"))
    m1_pct = stats.get("common_percentage_of_model1", 0)
    m2_pct = stats.get("common_percentage_of_model2", 0)
    m1_uniq = stats.get("model1_unique", 0)
    m2_uniq = stats.get("model2_unique", 0)
    out = [
        "## 📊 样本匹配统计",
        "",
        "### 整体匹配情况",
        "",
        "| 模型 | 总样本数 | 公共样本 | 独有样本 | 公共占比 |",
        "|------|----------|----------|----------|---------|",
        f"| **{model1}** | {m1_total} | {common} | {m1_uniq:,} | {m1_pct:.2f}% |",
        f"| **{model2}** | {m2_total} | {common} | {m2_uniq:,} | {m2_pct:.2f}% |",
        "",
    ]
    per_cls_stats = stats.get("per_class", {})
    if per_cls_stats:
        out += [
            "### 各类别匹配情况 (3D)",
            "",
            f"| 类别 | {m1_tag}总数 | {m2_tag}总数 | 公共样本 | {m1_tag}占比 | {m2_tag}占比 |",
            "|------|" + "---|" * 5,
        ]
        for cls, cs in per_cls_stats.items():
            out.append(
                f"| **{cls}** | {cs['model1_total']:,} | {cs['model2_total']:,} "
                f"| {cs['common']:,} | {cs['common_percentage_of_model1']:.2f}% "
                f"| {cs['common_percentage_of_model2']:.2f}% |"
            )
        out.append("")
    out += ["---", ""]
    return out


def _conclusion_lines(ov: dict, summary: dict, has_3d: bool, adv_m2: list,
                      adv_m1: list, model1: str, model2: str) -> list:
    """Build the closing summary and recommendation section."""
    m1_tag, m2_tag = _short_tag(model1), _short_tag(model2)
    sum2d = summary.get("2d", {})
    ap = sum2d.get("ap", {})
    f1 = sum2d.get("f1_score", {})
    map_entry = ov["map"]
    map_rel = map_entry["relative_change_%"]
    out = [
        "## 🎯 结论与建议",
        "",
        "### 2D检测汇总",
        "",
        f"- **AP 类别统计**: {m2_tag} 赢 {ap.get('wins', 0)} 类 / "
        f"{m1_tag} 赢 {ap.get('losses', 0)} 类 / 平局 {ap.get('ties', 0)} 类",
        f"- **F1 类别统计**: {m2_tag} 赢 {f1.get('wins', 0)} 类 / "
        f"{m1_tag} 赢 {f1.get('losses', 0)} 类 / 平局 {f1.get('ties', 0)} 类",
        f"- **整体mAP**: {model1}={fmt(map_entry[model1])} vs "
        f"{model2}={fmt(map_entry[model2])} ({fmt_pct(map_rel)})",
        "",
    ]

    if has_3d:
        sum3d = summary.get("3d", {})
        out += ["### 3D检测汇总", ""]
        for key, label in (
            ("lateral", "横向误差 (Lateral)"),
            ("longitudinal", "纵向误差 (Longitudinal)"),
            ("heading", "航向误差 (Heading)"),
        ):
            tally = sum3d.get(key, {})
            out.append(
                f"- **{label}**: {m2_tag} 赢 {tally.get('wins', 0)} 类 / "
                f"{m1_tag} 赢 {tally.get('losses', 0)} 类"
            )
        out.append("")

    out += ["### 综合建议", ""]
    # Name an overall winner only when mAP moved by more than 2% either way.
    if map_rel > 2:
        overall_winner = model2
    elif map_rel < -2:
        overall_winner = model1
    else:
        overall_winner = None
    if overall_winner:
        out.append(f"- 🏆 **综合mAP**: {overall_winner} 整体占优({fmt_pct(abs(map_rel))})")
    else:
        out.append(f"- ⚖️ **综合mAP**: 两模型基本持平(差距{fmt_pct(abs(map_rel))})")
    if adv_m2:
        names = "、".join(cls for cls, *_ in adv_m2)
        out.append(f"- ✅ **{model2} 改善**: {names} 类别AP有所提升")
    if adv_m1:
        names = "、".join(cls for cls, *_ in adv_m1)
        out.append(f"- ⚠️ **{model2} 退化**: {names} 类别AP有所下降")
    out.append("")
    return out


def build_report(data: dict, model1: str, model2: str,
                 report_date: str, title: str = None, background: str = None) -> str:
    """Render the full Markdown comparison report as a single string.

    Args:
        data: Parsed comparison_report.json. ``data["2d_metrics"]["overall"]``
            is required; ``3d_metrics``, ``match_statistics`` and ``summary``
            are optional and their sections are skipped or defaulted when absent.
        model1: Full name of the first model, as used for keys in ``data``.
        model2: Full name of the second model, as used for keys in ``data``.
        report_date: Evaluation date shown in the header.
        title: Optional report title; a default is generated when falsy.
        background: Optional background note; a default is generated when falsy.

    Returns:
        The report text, lines joined with ``"\\n"``.

    Raises:
        KeyError: If a required metric key is missing from ``data``.
    """
    m2d = data["2d_metrics"]
    m3d = data.get("3d_metrics", {})
    stats = data.get("match_statistics", {})
    summary = data.get("summary", {})
    ov = m2d["overall"]

    lines = _header_lines(stats, model1, model2, report_date, title, background)
    lines += _overall_2d_lines(ov, summary, model1, model2)
    per_class_lines, adv_m2, adv_m1 = _per_class_2d_lines(
        m2d.get("per_class", {}), model1, model2
    )
    lines += per_class_lines
    if m3d:
        lines += _metrics_3d_lines(m3d, model1, model2)
    if stats:
        lines += _match_stats_lines(stats, model1, model2)
    lines += _conclusion_lines(ov, summary, bool(m3d), adv_m2, adv_m1, model1, model2)
    return "\n".join(lines)
def main():
    """Command-line entry point: read the comparison JSON and write the Markdown report.

    Parses the CLI arguments, loads the JSON file, takes the two model names
    from the keys of ``2d_metrics.overall.precision``, builds the report and
    writes it to ``--output`` (default: ``COMPARISON_REPORT.md`` next to the
    JSON). Exits with status 1 and a message on stderr when the input is
    missing, is not valid JSON, or does not name two models.
    """
    parser = argparse.ArgumentParser(
        description="将 comparison_report.json 转换为中文 Markdown 评测报告"
    )
    parser.add_argument("json_path", help="comparison_report.json 的路径")
    parser.add_argument("--output", "-o", default=None,
                        help="输出 Markdown 文件路径(默认与 JSON 同目录,文件名 COMPARISON_REPORT.md)")
    parser.add_argument("--title", default=None,
                        help="自定义报告标题")
    parser.add_argument("--background", default=None,
                        help="背景说明文字")
    parser.add_argument("--date", default=str(date.today()),
                        help="评测日期 (默认今天,格式 YYYY-MM-DD)")
    args = parser.parse_args()

    json_path = Path(args.json_path)
    # is_file() also rejects a directory, which exists() would let through
    # only to fail later inside open().
    if not json_path.is_file():
        print(f"错误: 文件不存在: {json_path}", file=sys.stderr)
        sys.exit(1)
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as err:
        print(f"错误: JSON 解析失败: {json_path}: {err}", file=sys.stderr)
        sys.exit(1)

    # Read the model names from the JSON: they are the keys of the overall
    # precision entry, minus the derived "diff" / "relative_change_%" keys.
    try:
        precision_entry = data["2d_metrics"]["overall"]["precision"]
    except (KeyError, TypeError):
        precision_entry = {}
    skip = {"diff", "relative_change_%"}
    models = [name for name in precision_entry if name not in skip]
    if len(models) < 2:
        print("错误: 无法从 JSON 中自动识别模型名称,请检查文件格式。", file=sys.stderr)
        sys.exit(1)
    model1, model2 = models[0], models[1]
    print(f"模型1: {model1}")
    print(f"模型2: {model2}")

    report = build_report(data, model1, model2,
                          report_date=args.date,
                          title=args.title,
                          background=args.background)

    # Output path: explicit --output, else next to the input JSON.
    if args.output:
        out_path = Path(args.output)
    else:
        out_path = json_path.parent / "COMPARISON_REPORT.md"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(report, encoding="utf-8")
    print(f"报告已生成: {out_path}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()