286 lines
11 KiB
Python
Executable File
286 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Per-Case 2D Metrics Comparison Tool
|
|
|
|
This script compares per_case_2d metrics from two model evaluation reports
|
|
and identifies cases with significant metric differences.
|
|
|
|
Usage:
|
|
python eval_tools/model_comparison/compare_per_case_2d.py \
|
|
--model1 evaluation_results/.../evaluation_report.json \
|
|
--model2 evaluation_results/.../evaluation_report.json \
|
|
--threshold 0.1 \
|
|
--output comparison_per_case_2d.json
|
|
|
|
Example:
|
|
python eval_tools/model_comparison/compare_per_case_2d.py \
|
|
--model1 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/mono3d/20260211_113153/evaluation_report.json \
|
|
--model2 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/yolov5s-300w-newdata/20260211_113153/evaluation_report.json \
|
|
--threshold 0.1 \
|
|
--output per_case_comparison.json
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
import sys
|
|
import numpy as np
|
|
|
|
# Allow importing class_config from the eval_tools root
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
from class_config import CLASS_NAMES
|
|
|
|
|
|
class PerCaseComparator:
    """Compare per_case_2d metrics between two model evaluation reports.

    Each report is read through ``report.get('per_case_2d', {})``, which is
    treated as a mapping ``case name -> {'per_class': {class name -> metrics}}``.
    Every difference is computed as ``model2 - model1``.
    """

    # Metrics compared for every class. The order is significant: when two
    # metrics tie on absolute difference, the first one seen is kept as the
    # case maximum (the comparison below is strict).
    _METRICS = ('precision', 'recall', 'ap')

    def __init__(self, model1_report, model2_report, model1_name="Model-1", model2_name="Model-2"):
        """
        Initialize comparator.

        Args:
            model1_report: dict, evaluation report for model 1
            model2_report: dict, evaluation report for model 2
            model1_name: str, display name for model 1 (also used as a result key)
            model2_name: str, display name for model 2 (also used as a result key)
        """
        self.model1_report = model1_report
        self.model2_report = model2_report
        self.model1_name = model1_name
        self.model2_name = model2_name

    def _compare_case(self, case_name, m1_case, m2_case):
        """Build the comparison record for one case present in both reports.

        Only classes that appear in both cases are compared. A metric that is
        missing from a class entry is treated as 0.0.
        """
        case_comp = {
            'case_name': case_name,
            'per_class': {},
            'max_diff': 0.0,
            'max_diff_class': None,
            'max_diff_metric': None
        }

        m1_classes = m1_case.get('per_class', {})
        m2_classes = m2_case.get('per_class', {})

        for class_name, m1_class in m1_classes.items():
            if class_name not in m2_classes:
                continue
            m2_class = m2_classes[class_name]

            class_comp = {}
            for metric in self._METRICS:
                m1_val = m1_class.get(metric, 0.0)
                m2_val = m2_class.get(metric, 0.0)
                diff = m2_val - m1_val

                class_comp[metric] = {
                    self.model1_name: m1_val,
                    self.model2_name: m2_val,
                    'diff': diff,
                    'abs_diff': abs(diff)
                }

                # Strictly greater: a zero difference never becomes the
                # maximum, so max_diff_class stays None when nothing differs.
                if abs(diff) > case_comp['max_diff']:
                    case_comp['max_diff'] = abs(diff)
                    case_comp['max_diff_class'] = class_name
                    case_comp['max_diff_metric'] = metric

            # Count metrics for context; num_gt is taken from model 1's entry.
            class_comp['counts'] = {
                'num_gt': m1_class.get('num_gt', 0),
                'num_det_m1': m1_class.get('num_det', 0),
                'num_det_m2': m2_class.get('num_det', 0),
            }

            case_comp['per_class'][class_name] = class_comp

        return case_comp

    def compare_per_case_metrics(self, threshold=0.1, metric_name='ap'):
        """
        Compare per_case_2d metrics and identify cases with significant differences.

        A case is significant when the largest absolute difference over all of
        its shared classes and metrics is >= ``threshold``. Cases where nothing
        differs (or where the two reports share no class) are never reported,
        even for ``threshold <= 0``; previously that combination raised
        ``KeyError`` because no class/metric had been recorded for the case.

        Args:
            threshold: float, threshold for significant difference (default 0.1 = 10%)
            metric_name: str, accepted for backward compatibility but currently
                unused; precision, recall and ap are always all compared

        Returns:
            dict with keys 'summary', 'significant_differences' (sorted by
            descending max_diff) and 'all_case_comparisons'
        """
        print(f"\n{'='*80}")
        print(f"Comparing Per-Case 2D Metrics (threshold={threshold*100}%)")
        print(f"{'='*80}\n")

        # Get per_case_2d data
        m1_cases = self.model1_report.get('per_case_2d', {})
        m2_cases = self.model2_report.get('per_case_2d', {})

        # Only cases present in both reports can be compared
        common_cases = set(m1_cases) & set(m2_cases)
        print(f"Total cases in {self.model1_name}: {len(m1_cases)}")
        print(f"Total cases in {self.model2_name}: {len(m2_cases)}")
        print(f"Common cases: {len(common_cases)}\n")

        case_comparisons = {}
        significant_diffs = []

        for case_name in sorted(common_cases):
            case_comp = self._compare_case(
                case_name, m1_cases[case_name], m2_cases[case_name]
            )
            case_comparisons[case_name] = case_comp

            worst_class = case_comp['max_diff_class']
            if worst_class is None:
                # No metric differed, so there is nothing to attribute the
                # case to; looking up per_class[None] would raise KeyError.
                continue

            if case_comp['max_diff'] >= threshold:
                worst_metric = case_comp['max_diff_metric']
                significant_diffs.append({
                    'case_name': case_name,
                    'max_diff': case_comp['max_diff'],
                    'class': worst_class,
                    'metric': worst_metric,
                    'details': case_comp['per_class'][worst_class][worst_metric]
                })

        # Largest differences first
        significant_diffs.sort(key=lambda x: x['max_diff'], reverse=True)

        results = {
            'summary': {
                'total_common_cases': len(common_cases),
                'cases_with_significant_diff': len(significant_diffs),
                'threshold': threshold,
                'model1_name': self.model1_name,
                'model2_name': self.model2_name
            },
            'significant_differences': significant_diffs,
            'all_case_comparisons': case_comparisons
        }

        return results

    def print_significant_differences(self, results, top_n=20):
        """Print the first ``top_n`` entries of results['significant_differences'].

        Args:
            results: dict returned by compare_per_case_metrics
            top_n: int, maximum number of entries to print
        """
        sig_diffs = results['significant_differences']

        print(f"\n{'='*80}")
        print(f"Top {min(top_n, len(sig_diffs))} Cases with Significant Differences")
        print(f"{'='*80}\n")

        for i, diff in enumerate(sig_diffs[:top_n], 1):
            details = diff['details']
            print(f"{i}. Case: {diff['case_name']}")
            print(f" Class: {diff['class']}, Metric: {diff['metric']}")
            print(f" {self.model1_name}: {details[self.model1_name]:.4f}")
            print(f" {self.model2_name}: {details[self.model2_name]:.4f}")
            print(f" Difference: {details['diff']:+.4f} (abs: {diff['max_diff']:.4f})")
            print()

    def generate_summary_stats(self, results):
        """Aggregate signed differences per (class, metric) over all cases.

        Args:
            results: dict returned by compare_per_case_metrics

        Returns:
            dict keyed by "<class>_<metric>" with mean_diff, std_diff,
            median_diff, min_diff, max_diff (floats) and num_cases (int)
        """
        all_comps = results['all_case_comparisons']

        # Collect all signed differences by class and metric
        diffs_by_class_metric = defaultdict(list)
        for case_comp in all_comps.values():
            for class_name, class_comp in case_comp['per_class'].items():
                for metric in self._METRICS:
                    diffs_by_class_metric[(class_name, metric)].append(
                        class_comp[metric]['diff']
                    )

        # Calculate statistics
        stats = {}
        for (class_name, metric), diffs in diffs_by_class_metric.items():
            diffs_array = np.array(diffs)
            stats[f"{class_name}_{metric}"] = {
                'mean_diff': float(np.mean(diffs_array)),
                'std_diff': float(np.std(diffs_array)),
                'median_diff': float(np.median(diffs_array)),
                'min_diff': float(np.min(diffs_array)),
                'max_diff': float(np.max(diffs_array)),
                'num_cases': len(diffs)
            }

        return stats
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, loads the two evaluation report JSON files, runs
    PerCaseComparator on them, prints the top differing cases and the
    per-class summary statistics, and writes the combined results (including
    the summary statistics) as JSON to the --output path.
    """
    cli = argparse.ArgumentParser(
        description='Compare per_case_2d metrics between two model evaluation reports'
    )
    cli.add_argument('--model1', type=str, required=True,
                     help='Path to model 1 evaluation_report.json')
    cli.add_argument('--model2', type=str, required=True,
                     help='Path to model 2 evaluation_report.json')
    cli.add_argument('--model1-name', type=str, default='Model-1',
                     help='Display name for model 1')
    cli.add_argument('--model2-name', type=str, default='Model-2',
                     help='Display name for model 2')
    cli.add_argument('--threshold', type=float, default=0.1,
                     help='Threshold for significant difference (default: 0.1 = 10%%)')
    cli.add_argument('--output', type=str, default='per_case_comparison.json',
                     help='Output JSON file path')
    cli.add_argument('--top-n', type=int, default=20,
                     help='Number of top different cases to display (default: 20)')
    args = cli.parse_args()

    rule = '=' * 80

    # Load both reports, announcing each path just before it is read.
    loaded = []
    for report_path in (args.model1, args.model2):
        print(f"Loading {report_path}...")
        with open(report_path, 'r') as handle:
            loaded.append(json.load(handle))
    model1_report, model2_report = loaded

    comparator = PerCaseComparator(
        model1_report,
        model2_report,
        model1_name=args.model1_name,
        model2_name=args.model2_name,
    )

    # Run the comparison and show the largest per-case differences.
    results = comparator.compare_per_case_metrics(threshold=args.threshold)
    comparator.print_significant_differences(results, top_n=args.top_n)

    print(f"\n{rule}")
    print("Summary Statistics")
    print(f"{rule}\n")
    stats = comparator.generate_summary_stats(results)

    # Only classes named in CLASS_NAMES are printed here; statistics for any
    # other class are still written to the output file below.
    for class_name in CLASS_NAMES.values():
        if not (class_name in stats or f"{class_name}_ap" in stats):
            continue
        print(f"\n{class_name.upper()}:")
        for metric in ('ap', 'precision', 'recall'):
            entry = stats.get(f"{class_name}_{metric}")
            if entry is None:
                continue
            print(f" {metric:10s}: mean={entry['mean_diff']:+.4f}, "
                  f"std={entry['std_diff']:.4f}, median={entry['median_diff']:+.4f}, "
                  f"range=[{entry['min_diff']:+.4f}, {entry['max_diff']:+.4f}]")

    # Persist everything, making sure the destination directory exists.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    results['summary_statistics'] = stats
    with open(output_path, 'w') as handle:
        json.dump(results, handle, indent=2)

    print(f"\n{rule}")
    print(f"Results saved to: {output_path}")
    print(f"{rule}\n")
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if "__main__" == __name__:
    main()
|