Files
yolov26_3d/eval_tools/model_comparison/compare_per_case_2d.py
2026-06-24 09:35:46 +08:00

286 lines
11 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Per-Case 2D Metrics Comparison Tool
This script compares per_case_2d metrics from two model evaluation reports
and identifies cases with significant metric differences.
Usage:
python eval_tools/model_comparison/compare_per_case_2d.py \
--model1 evaluation_results/.../evaluation_report.json \
--model2 evaluation_results/.../evaluation_report.json \
--threshold 0.1 \
--output comparison_per_case_2d.json
Example:
python eval_tools/model_comparison/compare_per_case_2d.py \
--model1 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/mono3d/20260211_113153/evaluation_report.json \
--model2 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/yolov5s-300w-newdata/20260211_113153/evaluation_report.json \
--threshold 0.1 \
--output per_case_comparison.json
"""
import argparse
import json
import os
from pathlib import Path
from collections import defaultdict
import sys
import numpy as np
# Allow importing class_config from the eval_tools root
sys.path.insert(0, str(Path(__file__).parent.parent))
from class_config import CLASS_NAMES
class PerCaseComparator:
    """Compare per_case_2d metrics between two model evaluation reports.

    Both reports are plain dicts (as loaded from JSON). Only the
    ``'per_case_2d'`` entry is read; it maps a case name to a dict whose
    ``'per_class'`` entry maps a class name to that class's metrics.
    """

    # Metrics compared for every class. The order matters: when two metrics
    # have the same absolute difference, the one listed first is kept as the
    # case's "max diff" because only a strictly larger value replaces it.
    METRICS = ('precision', 'recall', 'ap')

    def __init__(self, model1_report, model2_report, model1_name="Model-1", model2_name="Model-2"):
        """
        Initialize comparator.

        Args:
            model1_report: dict, evaluation report for model 1
            model2_report: dict, evaluation report for model 2
            model1_name: str, display name for model 1
            model2_name: str, display name for model 2

        Note:
            The display names are also used as dictionary keys in the
            per-metric result entries (next to ``'diff'`` and ``'abs_diff'``),
            so they should be distinct from each other and from those keys.
        """
        self.model1_report = model1_report
        self.model2_report = model2_report
        self.model1_name = model1_name
        self.model2_name = model2_name

    @staticmethod
    def _metric_value(class_metrics, metric):
        """Return ``class_metrics[metric]``, using 0.0 when missing or None."""
        value = class_metrics.get(metric, 0.0)
        # A key that is present but holds None (JSON null) would break the
        # subtraction below; treat it exactly like a missing key.
        return 0.0 if value is None else value

    def _compare_case(self, case_name, m1_case, m2_case):
        """Build the comparison entry for one case present in both reports."""
        case_comp = {
            'case_name': case_name,
            'per_class': {},
            'max_diff': 0.0,
            'max_diff_class': None,
            'max_diff_metric': None
        }

        m1_classes = m1_case.get('per_class', {})
        m2_classes = m2_case.get('per_class', {})

        # Iterate in model 1's class order; classes missing from model 2
        # are skipped entirely.
        for class_name, m1_class in m1_classes.items():
            if class_name not in m2_classes:
                continue
            m2_class = m2_classes[class_name]

            class_comp = {}
            for metric in self.METRICS:
                m1_val = self._metric_value(m1_class, metric)
                m2_val = self._metric_value(m2_class, metric)
                diff = m2_val - m1_val  # positive => model 2 is higher
                class_comp[metric] = {
                    self.model1_name: m1_val,
                    self.model2_name: m2_val,
                    'diff': diff,
                    'abs_diff': abs(diff)
                }

                # Track the largest absolute difference seen in this case.
                if abs(diff) > case_comp['max_diff']:
                    case_comp['max_diff'] = abs(diff)
                    case_comp['max_diff_class'] = class_name
                    case_comp['max_diff_metric'] = metric

            # Count metrics for context; num_gt is read from model 1 only.
            class_comp['counts'] = {
                'num_gt': m1_class.get('num_gt', 0),
                'num_det_m1': m1_class.get('num_det', 0),
                'num_det_m2': m2_class.get('num_det', 0),
            }
            case_comp['per_class'][class_name] = class_comp

        return case_comp

    def compare_per_case_metrics(self, threshold=0.1, metric_name='ap'):
        """
        Compare per_case_2d metrics and identify cases with significant differences.

        Every case present in both reports is compared class by class over
        precision, recall and ap. A case is "significant" when its largest
        absolute difference is non-zero and at least ``threshold``.

        Args:
            threshold: float, threshold for significant difference (default 0.1 = 10%)
            metric_name: str, currently unused; kept for backward
                compatibility. All metrics in ``METRICS`` are always compared.

        Returns:
            dict with keys:
                'summary': counts, the threshold and both model names
                'significant_differences': list of significant cases, sorted
                    by descending max difference
                'all_case_comparisons': dict case name -> full comparison
        """
        print(f"\n{'='*80}")
        print(f"Comparing Per-Case 2D Metrics (threshold={threshold*100}%)")
        print(f"{'='*80}\n")

        # Get per_case_2d data
        m1_cases = self.model1_report.get('per_case_2d', {})
        m2_cases = self.model2_report.get('per_case_2d', {})

        # Only cases evaluated by both models can be compared.
        common_cases = set(m1_cases.keys()) & set(m2_cases.keys())

        print(f"Total cases in {self.model1_name}: {len(m1_cases)}")
        print(f"Total cases in {self.model2_name}: {len(m2_cases)}")
        print(f"Common cases: {len(common_cases)}\n")

        case_comparisons = {}
        significant_diffs = []

        for case_name in sorted(common_cases):
            case_comp = self._compare_case(
                case_name, m1_cases[case_name], m2_cases[case_name]
            )
            case_comparisons[case_name] = case_comp

            worst_class = case_comp['max_diff_class']
            if worst_class is None:
                # No class/metric differed (or no common classes): there is
                # nothing to report, and indexing per_class[None] would raise
                # KeyError when threshold <= 0.
                continue

            if case_comp['max_diff'] >= threshold:
                worst_metric = case_comp['max_diff_metric']
                significant_diffs.append({
                    'case_name': case_name,
                    'max_diff': case_comp['max_diff'],
                    'class': worst_class,
                    'metric': worst_metric,
                    'details': case_comp['per_class'][worst_class][worst_metric]
                })

        # Largest differences first.
        significant_diffs.sort(key=lambda x: x['max_diff'], reverse=True)

        results = {
            'summary': {
                'total_common_cases': len(common_cases),
                'cases_with_significant_diff': len(significant_diffs),
                'threshold': threshold,
                'model1_name': self.model1_name,
                'model2_name': self.model2_name
            },
            'significant_differences': significant_diffs,
            'all_case_comparisons': case_comparisons
        }
        return results

    def print_significant_differences(self, results, top_n=20):
        """Print top N cases with significant differences.

        Args:
            results: dict returned by ``compare_per_case_metrics``
            top_n: int, maximum number of cases to print
        """
        sig_diffs = results['significant_differences']

        print(f"\n{'='*80}")
        print(f"Top {min(top_n, len(sig_diffs))} Cases with Significant Differences")
        print(f"{'='*80}\n")

        for i, diff in enumerate(sig_diffs[:top_n], 1):
            details = diff['details']
            print(f"{i}. Case: {diff['case_name']}")
            print(f" Class: {diff['class']}, Metric: {diff['metric']}")
            print(f" {self.model1_name}: {details[self.model1_name]:.4f}")
            print(f" {self.model2_name}: {details[self.model2_name]:.4f}")
            print(f" Difference: {details['diff']:+.4f} (abs: {diff['max_diff']:.4f})")
            print()

    def generate_summary_stats(self, results):
        """Generate summary statistics.

        Args:
            results: dict returned by ``compare_per_case_metrics``

        Returns:
            dict keyed by ``"<class_name>_<metric>"``; each value holds the
            mean, standard deviation, median, min and max of the signed
            differences (model 2 minus model 1) across cases, plus the number
            of cases that contributed.
        """
        all_comps = results['all_case_comparisons']

        # Collect all signed differences by (class, metric).
        diffs_by_class_metric = defaultdict(list)
        for case_comp in all_comps.values():
            for class_name, class_comp in case_comp['per_class'].items():
                for metric in self.METRICS:
                    diffs_by_class_metric[(class_name, metric)].append(
                        class_comp[metric]['diff']
                    )

        # Calculate statistics; every list has at least one entry because a
        # key is only created when a value is appended.
        stats = {}
        for (class_name, metric), diffs in diffs_by_class_metric.items():
            diffs_array = np.array(diffs)
            stats[f"{class_name}_{metric}"] = {
                'mean_diff': float(np.mean(diffs_array)),
                'std_diff': float(np.std(diffs_array)),
                'median_diff': float(np.median(diffs_array)),
                'min_diff': float(np.min(diffs_array)),
                'max_diff': float(np.max(diffs_array)),
                'num_cases': len(diffs)
            }
        return stats
def main(argv=None):
    """Command-line entry point: compare two reports and save the result.

    Loads both evaluation reports, runs the per-case comparison, prints the
    most different cases and per-class summary statistics, then writes the
    full results (including the statistics) to the output JSON file.

    Args:
        argv: optional list of command-line arguments; when None, argparse
            reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description='Compare per_case_2d metrics between two model evaluation reports'
    )
    parser.add_argument('--model1', type=str, required=True,
                        help='Path to model 1 evaluation_report.json')
    parser.add_argument('--model2', type=str, required=True,
                        help='Path to model 2 evaluation_report.json')
    parser.add_argument('--model1-name', type=str, default='Model-1',
                        help='Display name for model 1')
    parser.add_argument('--model2-name', type=str, default='Model-2',
                        help='Display name for model 2')
    parser.add_argument('--threshold', type=float, default=0.1,
                        help='Threshold for significant difference (default: 0.1 = 10%%)')
    parser.add_argument('--output', type=str, default='per_case_comparison.json',
                        help='Output JSON file path')
    parser.add_argument('--top-n', type=int, default=20,
                        help='Number of top different cases to display (default: 20)')
    args = parser.parse_args(argv)

    # Load evaluation reports. JSON is read as UTF-8 explicitly so the result
    # does not depend on the platform's locale encoding.
    print(f"Loading {args.model1}...")
    with open(args.model1, 'r', encoding='utf-8') as f:
        model1_report = json.load(f)
    print(f"Loading {args.model2}...")
    with open(args.model2, 'r', encoding='utf-8') as f:
        model2_report = json.load(f)

    # Create comparator
    comparator = PerCaseComparator(
        model1_report, model2_report,
        model1_name=args.model1_name,
        model2_name=args.model2_name
    )

    # Compare metrics
    results = comparator.compare_per_case_metrics(threshold=args.threshold)

    # Print significant differences
    comparator.print_significant_differences(results, top_n=args.top_n)

    # Generate summary statistics
    print(f"\n{'='*80}")
    print("Summary Statistics")
    print(f"{'='*80}\n")
    stats = comparator.generate_summary_stats(results)

    # Print stats only for class names listed in CLASS_NAMES that have
    # statistics; classes that appear in the reports but not in CLASS_NAMES
    # are not printed (they are still saved to the output file).
    for class_name in [n for n in CLASS_NAMES.values() if n in stats or f"{n}_ap" in stats]:
        print(f"\n{class_name.upper()}:")
        for metric in ['ap', 'precision', 'recall']:
            key = f"{class_name}_{metric}"
            if key in stats:
                s = stats[key]
                print(f" {metric:10s}: mean={s['mean_diff']:+.4f}, "
                      f"std={s['std_diff']:.4f}, median={s['median_diff']:+.4f}, "
                      f"range=[{s['min_diff']:+.4f}, {s['max_diff']:+.4f}]")

    # Save results, creating the output directory if needed.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Add summary stats to results
    results['summary_statistics'] = stats
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"{'='*80}\n")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()