#!/usr/bin/env python3
"""
Per-Case 2D Metrics Comparison Tool

This script compares per_case_2d metrics from two model evaluation reports
and identifies cases with significant metric differences.

Usage:
    python eval_tools/model_comparison/compare_per_case_2d.py \
        --model1 evaluation_results/.../evaluation_report.json \
        --model2 evaluation_results/.../evaluation_report.json \
        --threshold 0.1 \
        --output comparison_per_case_2d.json

Example:
    python eval_tools/model_comparison/compare_per_case_2d.py \
        --model1 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/mono3d/20260211_113153/evaluation_report.json \
        --model2 evaluation_results/eval_results_common_match_comparison_CNCAP_roi0/yolov5s-300w-newdata/20260211_113153/evaluation_report.json \
        --threshold 0.1 \
        --output per_case_comparison.json
"""

import argparse
import json
import os
from pathlib import Path
from collections import defaultdict
import sys

import numpy as np

# Allow importing class_config from the eval_tools root
sys.path.insert(0, str(Path(__file__).parent.parent))
from class_config import CLASS_NAMES


class PerCaseComparator:
    """Compare per_case_2d metrics between two models."""

    # Metrics compared for every class of every case, in comparison order.
    METRICS = ('precision', 'recall', 'ap')

    def __init__(self, model1_report, model2_report,
                 model1_name="Model-1", model2_name="Model-2"):
        """
        Initialize comparator.

        Args:
            model1_report: dict, evaluation report for model 1
            model2_report: dict, evaluation report for model 2
            model1_name: str, display name for model 1
            model2_name: str, display name for model 2

        Note:
            The display names are used as dictionary keys next to 'diff' and
            'abs_diff' in the comparison output, so they should be distinct
            from each other and from those two keys.
        """
        self.model1_report = model1_report
        self.model2_report = model2_report
        self.model1_name = model1_name
        self.model2_name = model2_name

    @staticmethod
    def _metric_value(class_metrics, metric):
        """Return class_metrics[metric], treating a missing or None entry as 0.0."""
        value = class_metrics.get(metric)
        return 0.0 if value is None else value

    def compare_per_case_metrics(self, threshold=0.1, metric_name='ap'):
        """
        Compare per_case_2d metrics and identify cases with significant differences.

        Only cases present in both reports, and within them only classes
        present in both reports, are compared. For each class the difference
        (model 2 minus model 1) is computed for every metric in METRICS.

        Args:
            threshold: float, threshold for significant difference
                (default 0.1 = 10%). A case is significant when its largest
                absolute difference is >= threshold.
            metric_name: str, accepted for backward compatibility; currently
                unused, all metrics in METRICS are always compared.

        Returns:
            dict with keys:
                'summary': counts, threshold and model names
                'significant_differences': list sorted by descending max_diff
                'all_case_comparisons': per-case, per-class comparison data
        """
        print(f"\n{'='*80}")
        # ':g' avoids float artefacts such as 10.000000000000002 for 0.1 * 100.
        print(f"Comparing Per-Case 2D Metrics (threshold={threshold * 100:g}%)")
        print(f"{'='*80}\n")

        # Get per_case_2d data
        m1_cases = self.model1_report.get('per_case_2d', {})
        m2_cases = self.model2_report.get('per_case_2d', {})

        # Find common cases
        common_cases = set(m1_cases) & set(m2_cases)

        print(f"Total cases in {self.model1_name}: {len(m1_cases)}")
        print(f"Total cases in {self.model2_name}: {len(m2_cases)}")
        print(f"Common cases: {len(common_cases)}\n")

        # Compare each case
        case_comparisons = {}
        significant_diffs = []

        for case_name in sorted(common_cases):
            m1_classes = m1_cases[case_name].get('per_class', {})
            m2_classes = m2_cases[case_name].get('per_class', {})

            case_comp = {
                'case_name': case_name,
                'per_class': {},
                'max_diff': 0.0,
                'max_diff_class': None,
                'max_diff_metric': None
            }

            # Compare per-class metrics (classes missing from model 2 are skipped)
            for class_name, m1_class in m1_classes.items():
                if class_name not in m2_classes:
                    continue
                m2_class = m2_classes[class_name]

                class_comp = {}
                for metric in self.METRICS:
                    m1_val = self._metric_value(m1_class, metric)
                    m2_val = self._metric_value(m2_class, metric)
                    diff = m2_val - m1_val
                    abs_diff = abs(diff)

                    class_comp[metric] = {
                        self.model1_name: m1_val,
                        self.model2_name: m2_val,
                        'diff': diff,
                        'abs_diff': abs_diff
                    }

                    # Track maximum difference
                    if abs_diff > case_comp['max_diff']:
                        case_comp['max_diff'] = abs_diff
                        case_comp['max_diff_class'] = class_name
                        case_comp['max_diff_metric'] = metric

                # Add count metrics for context (GT count is taken from model 1)
                class_comp['counts'] = {
                    'num_gt': m1_class.get('num_gt', 0),
                    'num_det_m1': m1_class.get('num_det', 0),
                    'num_det_m2': m2_class.get('num_det', 0),
                }

                case_comp['per_class'][class_name] = class_comp

            case_comparisons[case_name] = case_comp

            # Check if this case has significant differences. max_diff_class
            # stays None when no class is shared or every diff is exactly 0;
            # with threshold <= 0 such a case would otherwise pass the check
            # and crash on the details lookup below.
            max_class = case_comp['max_diff_class']
            if max_class is not None and case_comp['max_diff'] >= threshold:
                max_metric = case_comp['max_diff_metric']
                significant_diffs.append({
                    'case_name': case_name,
                    'max_diff': case_comp['max_diff'],
                    'class': max_class,
                    'metric': max_metric,
                    'details': case_comp['per_class'][max_class][max_metric]
                })

        # Sort by maximum difference
        significant_diffs.sort(key=lambda x: x['max_diff'], reverse=True)

        results = {
            'summary': {
                'total_common_cases': len(common_cases),
                'cases_with_significant_diff': len(significant_diffs),
                'threshold': threshold,
                'model1_name': self.model1_name,
                'model2_name': self.model2_name
            },
            'significant_differences': significant_diffs,
            'all_case_comparisons': case_comparisons
        }

        return results

    def print_significant_differences(self, results, top_n=20):
        """
        Print top N cases with significant differences.

        Args:
            results: dict, as returned by compare_per_case_metrics
            top_n: int, maximum number of cases to print (default 20)
        """
        sig_diffs = results['significant_differences']

        print(f"\n{'='*80}")
        print(f"Top {min(top_n, len(sig_diffs))} Cases with Significant Differences")
        print(f"{'='*80}\n")

        for i, diff in enumerate(sig_diffs[:top_n], 1):
            details = diff['details']
            print(f"{i}. Case: {diff['case_name']}")
            print(f" Class: {diff['class']}, Metric: {diff['metric']}")
            print(f" {self.model1_name}: {details[self.model1_name]:.4f}")
            print(f" {self.model2_name}: {details[self.model2_name]:.4f}")
            print(f" Difference: {details['diff']:+.4f} (abs: {diff['max_diff']:.4f})")
            print()

    def generate_summary_stats(self, results):
        """
        Generate summary statistics.

        Args:
            results: dict, as returned by compare_per_case_metrics

        Returns:
            dict mapping "<class_name>_<metric>" to the mean, std, median,
            min and max of the signed differences over all compared cases,
            plus 'num_cases'.
        """
        all_comps = results['all_case_comparisons']

        # Collect all differences by class and metric
        diffs_by_class_metric = defaultdict(list)

        for case_comp in all_comps.values():
            for class_name, class_comp in case_comp['per_class'].items():
                for metric in self.METRICS:
                    diffs_by_class_metric[(class_name, metric)].append(
                        class_comp[metric]['diff']
                    )

        # Calculate statistics (cast to float so the result is JSON-serializable)
        stats = {}
        for (class_name, metric), diffs in diffs_by_class_metric.items():
            diffs_array = np.array(diffs)
            stats[f"{class_name}_{metric}"] = {
                'mean_diff': float(np.mean(diffs_array)),
                'std_diff': float(np.std(diffs_array)),
                'median_diff': float(np.median(diffs_array)),
                'min_diff': float(np.min(diffs_array)),
                'max_diff': float(np.max(diffs_array)),
                'num_cases': len(diffs)
            }

        return stats


def main():
    """Parse CLI arguments, compare the two reports, print and save the results."""
    parser = argparse.ArgumentParser(
        description='Compare per_case_2d metrics between two model evaluation reports'
    )
    parser.add_argument('--model1', type=str, required=True,
                        help='Path to model 1 evaluation_report.json')
    parser.add_argument('--model2', type=str, required=True,
                        help='Path to model 2 evaluation_report.json')
    parser.add_argument('--model1-name', type=str, default='Model-1',
                        help='Display name for model 1')
    parser.add_argument('--model2-name', type=str, default='Model-2',
                        help='Display name for model 2')
    parser.add_argument('--threshold', type=float, default=0.1,
                        help='Threshold for significant difference (default: 0.1 = 10%%)')
    parser.add_argument('--output', type=str, default='per_case_comparison.json',
                        help='Output JSON file path')
    parser.add_argument('--top-n', type=int, default=20,
                        help='Number of top different cases to display (default: 20)')

    args = parser.parse_args()

    # Load evaluation reports (explicit UTF-8 so the result does not depend
    # on the platform's default encoding)
    print(f"Loading {args.model1}...")
    with open(args.model1, 'r', encoding='utf-8') as f:
        model1_report = json.load(f)

    print(f"Loading {args.model2}...")
    with open(args.model2, 'r', encoding='utf-8') as f:
        model2_report = json.load(f)

    # Create comparator
    comparator = PerCaseComparator(
        model1_report, model2_report,
        model1_name=args.model1_name,
        model2_name=args.model2_name
    )

    # Compare metrics
    results = comparator.compare_per_case_metrics(threshold=args.threshold)

    # Print significant differences
    comparator.print_significant_differences(results, top_n=args.top_n)

    # Generate summary statistics
    print(f"\n{'='*80}")
    print("Summary Statistics")
    print(f"{'='*80}\n")

    stats = comparator.generate_summary_stats(results)

    # Print stats for main classes (all 3D classes, skipping vehicle sub-buckets)
    main_classes = [
        n for n in CLASS_NAMES.values()
        if n in stats or f"{n}_ap" in stats
    ]
    for class_name in main_classes:
        print(f"\n{class_name.upper()}:")
        for metric in ['ap', 'precision', 'recall']:
            key = f"{class_name}_{metric}"
            if key in stats:
                s = stats[key]
                print(f" {metric:10s}: mean={s['mean_diff']:+.4f}, "
                      f"std={s['std_diff']:.4f}, median={s['median_diff']:+.4f}, "
                      f"range=[{s['min_diff']:+.4f}, {s['max_diff']:+.4f}]")

    # Save results
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Add summary stats to results
    results['summary_statistics'] = stats

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2)

    print(f"\n{'='*80}")
    print(f"Results saved to: {output_path}")
    print(f"{'='*80}\n")


if __name__ == '__main__':
    main()