yolov26_3d/eval_tools/model_comparison/compare_models.py

#!/usr/bin/env python3
"""
Model Evaluation Comparison Tool

This script compares evaluation results from two different models and generates
comprehensive comparison reports including:
- Overall 2D and 3D metrics comparison
- Per-class performance comparison
- Distance-range based 3D metrics comparison
- Per-case performance comparison
- Statistical significance tests
- Visualization plots

Usage:
    python eval_tools/model_comparison/compare_models.py \
        --model1 eval_results/model1/evaluation_report.json \
        --model2 eval_results/model2/evaluation_report.json \
        --output-dir comparison_results \
        --model1-name "Model-A" \
        --model2-name "Model-B"
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
import numpy as np
from collections import defaultdict

# Allow importing class_config from the eval_tools root
sys.path.insert(0, str(Path(__file__).parent.parent))
from class_config import REPORT_3D_CLASS_KEYS

# Allow importing from the same directory
sys.path.insert(0, str(Path(__file__).parent))
try:
    from find_common_matches import recompute_3d_stats_from_common_matches as _recompute_range_stats
except ImportError:
    _recompute_range_stats = None


class ModelComparator:
    """Compare evaluation results from two models."""

    def __init__(self, model1_report, model2_report, model1_name="Model-1", model2_name="Model-2",
                 common_matches_data=None,
                 model1_detailed_path=None, model2_detailed_path=None):
        """
        Store the two evaluation reports and the options that steer the comparison.

        Model 1 acts as the reference side: every ``diff`` computed by the
        compare_* methods is "model 2 minus model 1".

        Args:
            model1_report: dict, evaluation report for model 1
            model2_report: dict, evaluation report for model 2
            model1_name: str, display name for model 1 (also used as a key in
                         the comparison dicts)
            model2_name: str, display name for model 2 (also used as a key in
                         the comparison dicts)
            common_matches_data: dict, optional data from find_common_matches.py;
                                 when given, the 3D comparison is restricted to
                                 common matches
            model1_detailed_path: str, optional path to model 1
                                  detailed_3d_matches.json (fallback when
                                  common_matches.json lacks per-range data)
            model2_detailed_path: str, optional path to model 2
                                  detailed_3d_matches.json
        """
        # Results are accumulated here by the compare_* / generate_* methods.
        self.comparison_results = {}

        # Inputs being compared.
        self.model1_report, self.model2_report = model1_report, model2_report
        self.model1_name, self.model2_name = model1_name, model2_name

        # Optional common-match inputs for the 3D comparison.
        self.common_matches_data = common_matches_data
        self.model1_detailed_path, self.model2_detailed_path = (
            model1_detailed_path, model2_detailed_path)

    def compare_2d_metrics(self):
        """Compare overall and per-class 2D detection metrics of the two reports.

        Overall metrics (precision, recall, f1_score, map) are read from
        ``report['2d_evaluation']['overall']``; per-class metrics (precision,
        recall, f1_score, ap) from ``report['2d_evaluation']['per_class']``.
        Classes present for model 1 but missing for model 2 are skipped.

        Each entry holds both raw values, ``diff`` (model 2 minus model 1) and
        ``relative_change_%`` (relative to model 1; 0 when the model 1 value is
        not positive).

        Returns:
            dict with keys ``'overall'`` and ``'per_class'``; the same dict is
            stored in ``self.comparison_results['2d_metrics']``.

        Raises:
            KeyError: if a report lacks the ``2d_evaluation`` section or one of
                the expected metrics.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("Comparing 2D Detection Metrics")
        print(rule)

        name1, name2 = self.model1_name, self.model2_name

        def side_by_side(base, candidate):
            # One comparison entry; guards the division against a zero baseline.
            delta = candidate - base
            relative = delta / base * 100 if base > 0 else 0
            return {
                name1: base,
                name2: candidate,
                'diff': delta,
                'relative_change_%': relative,
            }

        # Overall metrics
        base_overall = self.model1_report['2d_evaluation']['overall']
        cand_overall = self.model2_report['2d_evaluation']['overall']
        overall_cmp = {
            metric: side_by_side(base_overall[metric], cand_overall[metric])
            for metric in ('precision', 'recall', 'f1_score', 'map')
        }

        # Per-class metrics, restricted to classes both reports contain
        base_classes = self.model1_report['2d_evaluation']['per_class']
        cand_classes = self.model2_report['2d_evaluation']['per_class']
        per_class_cmp = {}
        for class_name, base_metrics in base_classes.items():
            if class_name not in cand_classes:
                continue
            cand_metrics = cand_classes[class_name]
            per_class_cmp[class_name] = {
                metric: side_by_side(base_metrics[metric], cand_metrics[metric])
                for metric in ('precision', 'recall', 'f1_score', 'ap')
            }

        comparison = {'overall': overall_cmp, 'per_class': per_class_cmp}
        self.comparison_results['2d_metrics'] = comparison
        return comparison

    def _compare_range_bucket(self, m1_range, m2_range):
        """Build the comparison entry for one distance-range bucket.

        Args:
            m1_range: dict for model 1, shaped like
                ``3d_evaluation[class][range_key]`` (``num_samples`` plus one
                ``{'mean', 'std'}`` dict per error type).
            m2_range: the matching dict for model 2.

        Returns:
            dict mapping each error type present in BOTH buckets to its
            comparison entry (per-model mean/std/samples, ``diff``,
            ``relative_change_%`` and ``improvement``), or ``None`` when either
            bucket has no samples or no error type is shared.
        """
        # A bucket without samples on either side cannot be compared.
        if m1_range.get('num_samples', 0) == 0:
            return None
        if m2_range.get('num_samples', 0) == 0:
            return None

        error_types = (
            'lateral_error',
            'longitudinal_error',
            'longitudinal_relative_error',
            'heading_error',
            'heading_error_relaxed',
        )

        result = {}
        for key in error_types:
            if not (key in m1_range and key in m2_range):
                continue

            mean1 = m1_range[key]['mean']
            mean2 = m2_range[key]['mean']
            delta = mean2 - mean1
            if mean1 > 0:
                relative = delta / mean1 * 100
            else:
                relative = 0

            result[key] = {
                self.model1_name: {
                    'mean': mean1,
                    'std': m1_range[key]['std'],
                    'samples': m1_range['num_samples'],
                },
                self.model2_name: {
                    'mean': mean2,
                    'std': m2_range[key]['std'],
                    'samples': m2_range['num_samples'],
                },
                'diff': delta,
                'relative_change_%': relative,
                # These are error metrics, so a smaller mean counts as better.
                'improvement': mean2 < mean1,
            }

        return result or None

    def _compare_3d_metrics_common_matches(self):
        """Compare 3D metrics using only the matches common to both models.

        Reads ``model1_3d_stats``, ``model2_3d_stats`` and ``match_statistics``
        from ``self.common_matches_data``. Two stat layouts are understood:

        * per-range: ``stats[class][range_key]`` with an ``'overall'`` bucket;
        * legacy flat: ``stats[class]`` holds the error dicts directly.

        Legacy data is upgraded to the per-range layout when the optional
        ``find_common_matches`` helper could be imported and both detailed
        match files exist; otherwise only an ``'overall'`` entry is produced
        per class.

        Returns:
            dict keyed by class name. It is also stored in
            ``self.comparison_results['3d_metrics']``, and the raw match
            statistics in ``self.comparison_results['match_statistics']``.
        """
        shared = self.common_matches_data
        name1, name2 = self.model1_name, self.model2_name

        # Recomputed stats shipped inside the common-matches data
        m1_stats = shared.get('model1_3d_stats', {})
        m2_stats = shared.get('model2_3d_stats', {})
        match_stats = shared.get('match_statistics', {})

        def _is_per_range(stats):
            # Layout detection looks at the first class entry only:
            #   per-range -> stats[class]['overall'] exists
            #   legacy    -> stats[class]['lateral_error'] directly
            no_classes = object()
            first = next(iter(stats.values()), no_classes)
            if first is no_classes:
                return False
            return 'overall' in first

        def _load_json(path):
            with open(path, 'r') as handle:
                return json.load(handle)

        def _reversal_block(stats_a, stats_b):
            # Callers have already checked that both sides carry 'reversal_count'.
            return {
                name1: {
                    'count': stats_a['reversal_count'],
                    'percentage': stats_a.get('reversal_percentage', 0),
                },
                name2: {
                    'count': stats_b['reversal_count'],
                    'percentage': stats_b.get('reversal_percentage', 0),
                },
            }

        use_range_format = _is_per_range(m1_stats) and _is_per_range(m2_stats)

        # ── Upgrade legacy common_matches.json to the per-range layout on the fly ──
        if not use_range_format and _recompute_range_stats is not None:
            print("  common_matches.json has legacy flat format — computing per-range stats "
                  "from detailed_3d_matches.json...")
            m1_det = self.model1_detailed_path
            m2_det = self.model2_detailed_path
            have_both_files = (m1_det and m2_det
                               and Path(m1_det).exists() and Path(m2_det).exists())
            if have_both_files:
                m1_detailed = _load_json(m1_det)
                m2_detailed = _load_json(m2_det)
                raw_common = shared.get('common_matches', {})
                m1_stats = _recompute_range_stats(m1_detailed, raw_common, 'model1')
                m2_stats = _recompute_range_stats(m2_detailed, raw_common, 'model2')
                use_range_format = True
                print("  ✓ Per-range stats computed from detailed match files.")
            else:
                print("  WARNING: detailed_3d_matches.json paths not available or not found; "
                      "distance-range sections will be empty.")
                print(f"    model1 path: {m1_det}")
                print(f"    model2 path: {m2_det}")

        # Match statistics summary
        print("\nMatch Statistics:")
        print(f"  {name1} Total Matches: {match_stats.get('model1_total', 0):,}")
        print(f"  {name2} Total Matches: {match_stats.get('model2_total', 0):,}")
        print(f"  Common Matches: {match_stats.get('common', 0):,} "
              f"({match_stats.get('common_percentage_of_model1', 0):.1f}% of {name1})")
        print(f"  {name1} Unique: {match_stats.get('model1_unique', 0):,}")
        print(f"  {name2} Unique: {match_stats.get('model2_unique', 0):,}")

        comparison = {}
        for class_name, m1_class in m1_stats.items():
            if class_name not in m2_stats:
                continue
            m2_class = m2_stats[class_name]

            if use_range_format:
                # Per-range layout: each class maps range_key -> stats dict
                class_cmp = {
                    'common_samples': m1_class.get('overall', {}).get('num_samples', 0),
                    'match_info': match_stats.get('per_class', {}).get(class_name, {}),
                }
                comparison[class_name] = class_cmp

                for range_key, m1_bucket in m1_class.items():
                    if range_key == 'match_info' or range_key not in m2_class:
                        continue
                    bucket_cmp = self._compare_range_bucket(m1_bucket, m2_class[range_key])
                    if bucket_cmp is not None:
                        class_cmp[range_key] = bucket_cmp

                # Attach reversal info to the overall bucket when both sides have it
                ov1 = m1_class.get('overall', {})
                ov2 = m2_class.get('overall', {})
                if 'reversal_count' in ov1 and 'reversal_count' in ov2:
                    class_cmp.setdefault('overall', {})['reversal_info'] = \
                        _reversal_block(ov1, ov2)
            else:
                # Legacy layout: the class dict itself is the (only) overall bucket
                overall_cmp = {}
                comparison[class_name] = {
                    'overall': overall_cmp,
                    'common_samples': m1_class.get('num_samples', 0),
                    'match_info': match_stats.get('per_class', {}).get(class_name, {}),
                }

                for error_type in ('lateral_error', 'longitudinal_error', 'heading_error',
                                   'longitudinal_relative_error', 'heading_error_relaxed'):
                    if error_type not in m1_class or error_type not in m2_class:
                        continue
                    mean1 = m1_class[error_type]['mean']
                    mean2 = m2_class[error_type]['mean']
                    delta = mean2 - mean1
                    overall_cmp[error_type] = {
                        name1: {
                            'mean': mean1,
                            'std': m1_class[error_type]['std'],
                            'samples': m1_class.get('num_samples', 0),
                        },
                        name2: {
                            'mean': mean2,
                            'std': m2_class[error_type]['std'],
                            'samples': m2_class.get('num_samples', 0),
                        },
                        'diff': delta,
                        'relative_change_%': (delta / mean1 * 100) if mean1 > 0 else 0,
                        'improvement': mean2 < mean1,
                    }

                if 'reversal_count' in m1_class and 'reversal_count' in m2_class:
                    overall_cmp['reversal_info'] = _reversal_block(m1_class, m2_class)

        self.comparison_results['3d_metrics'] = comparison
        self.comparison_results['match_statistics'] = match_stats
        return comparison

    def compare_3d_metrics(self):
        """Compare 3D detection error metrics between the two models.

        When ``common_matches_data`` was supplied, the work is delegated to
        ``_compare_3d_metrics_common_matches``. Otherwise the
        ``3d_evaluation`` sections of both reports are compared class by class
        (classes missing from either report are skipped):

        * per-range layout (the class dict has an ``'overall'`` key): every
          range key present in both reports is compared;
        * legacy flat layout: the class dict is treated as the single
          ``'overall'`` bucket.

        Each bucket is compared with ``_compare_range_bucket``, so both layouts
        report the same error types (including ``heading_error_relaxed`` and
        ``longitudinal_relative_error`` when present) and an error type missing
        from either side is skipped instead of raising ``KeyError``.

        Buckets where both models have zero samples are omitted. Buckets where
        only one model has samples are kept as an empty dict.

        Returns:
            dict ``{class_name: {range_key: {error_type: entry}}}``; the same
            dict is stored in ``self.comparison_results['3d_metrics']``.
        """
        rule = "=" * 80
        print("\n" + rule)
        if self.common_matches_data:
            print("Comparing 3D Detection Metrics (COMMON MATCHES ONLY)")
        else:
            print("Comparing 3D Detection Metrics")
        print(rule)

        # With common matches the stats are precomputed elsewhere.
        if self.common_matches_data:
            return self._compare_3d_metrics_common_matches()

        comparison = {}

        m1_3d = self.model1_report.get('3d_evaluation', {})
        m2_3d = self.model2_report.get('3d_evaluation', {})

        for class_name, m1_class in m1_3d.items():
            if class_name not in m2_3d:
                continue
            m2_class = m2_3d[class_name]

            if 'overall' in m1_class:
                # Distance-range layout: compare every bucket both reports share.
                buckets = [(range_key, m1_class[range_key], m2_class[range_key])
                           for range_key in m1_class if range_key in m2_class]
            else:
                # Legacy layout: the class dict itself is the only bucket.
                buckets = [('overall', m1_class, m2_class)]

            class_cmp = comparison[class_name] = {}
            for range_key, m1_range, m2_range in buckets:
                # Ignore stray non-bucket values stored next to the range dicts.
                if not (isinstance(m1_range, dict) and isinstance(m2_range, dict)):
                    continue
                if (m1_range.get('num_samples', 0) == 0
                        and m2_range.get('num_samples', 0) == 0):
                    continue

                bucket_cmp = self._compare_range_bucket(m1_range, m2_range)
                # The helper returns None when one side has no samples (or no
                # error type is shared); keep the historical empty-dict
                # placeholder so the result shape is unchanged for consumers.
                class_cmp[range_key] = bucket_cmp if bucket_cmp is not None else {}

        self.comparison_results['3d_metrics'] = comparison
        return comparison

    def compare_per_case(self):
        """Compare both models case by case, for cases present in both reports.

        2D: for every case found in both ``per_case_2d`` sections, the overall
        precision / recall / f1_score / map values are compared (``diff`` is
        model 2 minus model 1; ``relative_change_%`` is relative to model 1 and
        0 when the model 1 value is not positive).

        3D: for every case found in both ``per_case_3d`` sections and every
        class listed in ``REPORT_3D_CLASS_KEYS`` that both cases contain, the
        mean lateral and longitudinal errors are compared. The per-class
        ``'overall'`` bucket is used when model 1 has one; otherwise the class
        dict itself is used. A class where either model has zero samples keeps
        an empty entry.

        Returns:
            dict with keys ``'2d'`` and ``'3d'``; the same dict is stored in
            ``self.comparison_results['per_case']``.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("Comparing Per-Case Performance")
        print(rule)

        name1, name2 = self.model1_name, self.model2_name
        cmp_2d = {}
        cmp_3d = {}
        comparison = {'2d': cmp_2d, '3d': cmp_3d}

        # Cases evaluated by both models (2D)
        cases_2d_m1 = set(self.model1_report.get('per_case_2d', {}).keys())
        cases_2d_m2 = set(self.model2_report.get('per_case_2d', {}).keys())
        common_cases = cases_2d_m1 & cases_2d_m2

        print(f"Found {len(common_cases)} common cases")

        # 2D per-case comparison (overall metrics only)
        for case_name in sorted(common_cases):
            m1_case = self.model1_report['per_case_2d'][case_name]
            m2_case = self.model2_report['per_case_2d'][case_name]

            overall_cmp = {}
            cmp_2d[case_name] = {'overall': overall_cmp}
            for metric in ('precision', 'recall', 'f1_score', 'map'):
                base = m1_case['overall'][metric]
                candidate = m2_case['overall'][metric]
                delta = candidate - base
                overall_cmp[metric] = {
                    name1: base,
                    name2: candidate,
                    'diff': delta,
                    'relative_change_%': (delta / base * 100) if base > 0 else 0,
                }

        # Cases evaluated by both models (3D)
        cases_3d_m1 = set(self.model1_report.get('per_case_3d', {}).keys())
        cases_3d_m2 = set(self.model2_report.get('per_case_3d', {}).keys())

        # 3D per-case comparison
        for case_name in sorted(cases_3d_m1 & cases_3d_m2):
            m1_case = self.model1_report['per_case_3d'][case_name]
            m2_case = self.model2_report['per_case_3d'][case_name]

            case_cmp = cmp_3d[case_name] = {}

            for class_name in REPORT_3D_CLASS_KEYS:
                if not (class_name in m1_case and class_name in m2_case):
                    continue

                class_cmp = case_cmp[class_name] = {}
                m1_class = m1_case[class_name]
                m2_class = m2_case[class_name]

                # Model 1's layout decides which bucket is read for both models.
                if 'overall' in m1_class:
                    m1_overall = m1_class['overall']
                    m2_overall = m2_class['overall']
                else:
                    m1_overall, m2_overall = m1_class, m2_class

                if m1_overall['num_samples'] == 0 or m2_overall['num_samples'] == 0:
                    continue

                for error_type in ('lateral_error', 'longitudinal_error'):
                    mean1 = m1_overall[error_type]['mean']
                    mean2 = m2_overall[error_type]['mean']
                    class_cmp[error_type] = {
                        name1: mean1,
                        name2: mean2,
                        'diff': mean2 - mean1,
                        # Lower error is better.
                        'improvement': mean1 > mean2,
                    }

        self.comparison_results['per_case'] = comparison
        return comparison

    def generate_summary_stats(self):
        """Tally wins / losses / ties of model 2 against model 1.

        2D (per class, from ``comparison_results['2d_metrics']['per_class']``):
        for ``ap`` and ``f1_score`` an absolute difference below 0.01 is a tie,
        a positive difference a win, anything else a loss.

        3D (per class and per range bucket, from
        ``comparison_results['3d_metrics']``): for the lateral, longitudinal
        and heading errors an entry with ``diff == 0`` is a tie, an entry
        flagged ``improvement`` a win, anything else a loss. Each error type is
        tallied independently, so a bucket lacking one error type still
        contributes its other error types. Non-dict fields stored beside the
        buckets (e.g. ``common_samples``) are ignored.

        Sections missing from ``comparison_results`` leave their tallies at 0.

        Returns:
            dict ``{'2d': {'ap', 'f1_score'}, '3d': {'lateral', 'longitudinal',
            'heading'}}`` where every leaf is
            ``{'wins': int, 'losses': int, 'ties': int}``; the same dict is
            stored in ``self.comparison_results['summary']``.
        """
        print("\n" + "="*80)
        print("Generating Summary Statistics")
        print("="*80)

        def _tally():
            # "wins" = number of comparisons where model 2 is better.
            return {'wins': 0, 'losses': 0, 'ties': 0}

        summary = {
            '2d': {'ap': _tally(), 'f1_score': _tally()},
            '3d': {'lateral': _tally(), 'longitudinal': _tally(), 'heading': _tally()},
        }

        # 2D: higher is better; differences under 0.01 count as ties.
        if '2d_metrics' in self.comparison_results:
            for metrics in self.comparison_results['2d_metrics']['per_class'].values():
                for metric_name in ('ap', 'f1_score'):
                    if metric_name not in metrics:
                        continue
                    diff = metrics[metric_name]['diff']
                    counts = summary['2d'][metric_name]
                    if abs(diff) < 0.01:
                        counts['ties'] += 1
                    elif diff > 0:
                        counts['wins'] += 1
                    else:
                        counts['losses'] += 1

        # 3D: lower error is better ('improvement' is True when model 2's mean
        # is strictly smaller).
        error_to_summary_key = (
            ('lateral_error', 'lateral'),
            ('longitudinal_error', 'longitudinal'),
            ('heading_error', 'heading'),
        )
        if '3d_metrics' in self.comparison_results:
            for ranges in self.comparison_results['3d_metrics'].values():
                for metrics in ranges.values():
                    # Skip non-metric fields (like 'common_samples').
                    if not isinstance(metrics, dict):
                        continue
                    for error_type, summary_key in error_to_summary_key:
                        entry = metrics.get(error_type)
                        if not isinstance(entry, dict):
                            continue
                        counts = summary['3d'][summary_key]
                        if entry.get('diff') == 0:
                            # Identical means are a tie, not a loss for model 2.
                            counts['ties'] += 1
                        elif entry['improvement']:
                            counts['wins'] += 1
                        else:
                            counts['losses'] += 1

        self.comparison_results['summary'] = summary
        return summary

    def generate_text_report(self, output_file):
        """Generate human-readable text report."""
        print(f"\nGenerating text report: {output_file}")

        with open(output_file, 'w') as f:
            f.write("="*80 + "\n")
            f.write("MODEL COMPARISON REPORT\n")
            f.write("="*80 + "\n\n")

            f.write(f"Model 1: {self.model1_name}\n")
            f.write(f"Model 2: {self.model2_name}\n\n")

            # 2D Overall Comparison
            if '2d_metrics' in self.comparison_results:
                f.write("\n" + "="*80 + "\n")
                f.write("2D DETECTION METRICS - OVERALL COMPARISON\n")
                f.write("="*80 + "\n\n")

                overall = self.comparison_results['2d_metrics']['overall']
                f.write(f"{'Metric':<15} {self.model1_name:<12} {self.model2_name:<12} {'Diff':<12} {'Change %':<12}\n")
                f.write("-"*80 + "\n")

                for metric, values in overall.items():
                    f.write(f"{metric.upper():<15} "
                           f"{values[self.model1_name]:<12.4f} "
                           f"{values[self.model2_name]:<12.4f} "
                           f"{values['diff']:>+11.4f} "
                           f"{values['relative_change_%']:>+11.2f}%\n")

                # Per-class 2D comparison - detailed metrics
                f.write("\n" + "="*80 + "\n")
                f.write("2D DETECTION METRICS - PER-CLASS COMPARISON\n")
                f.write("="*80 + "\n\n")

                # Precision, Recall, F1 Score comparison
                f.write("Precision / Recall / F1 Score:\n")
                f.write(f"{'Class':<15} {'Metric':<12} {self.model1_name:<12} {self.model2_name:<12} {'Diff':<12} {'Change %':<12}\n")
                f.write("-"*100 + "\n")

                per_class = self.comparison_results['2d_metrics']['per_class']
                for class_name in sorted(per_class.keys()):
                    for metric_name in ['precision', 'recall', 'f1_score']:
                        metric_data = per_class[class_name][metric_name]
                        f.write(f"{class_name:<15} "
                               f"{metric_name:<12} "
                               f"{metric_data[self.model1_name]:<12.4f} "
                               f"{metric_data[self.model2_name]:<12.4f} "
                               f"{metric_data['diff']:>+11.4f} "
                               f"{metric_data['relative_change_%']:>+11.2f}%\n")

                # AP comparison
                f.write("\nAverage Precision (AP):\n")
                f.write(f"{'Class':<15} {self.model1_name:<12} {self.model2_name:<12} {'Diff':<12} {'Change %':<12}\n")
                f.write("-"*80 + "\n")

                for class_name in sorted(per_class.keys()):
                    ap_data = per_class[class_name]['ap']
                    f.write(f"{class_name:<15} "
                           f"{ap_data[self.model1_name]:<12.4f} "
                           f"{ap_data[self.model2_name]:<12.4f} "
                           f"{ap_data['diff']:>+11.4f} "
                           f"{ap_data['relative_change_%']:>+11.2f}%\n")

            # 3D Comparison
            if '3d_metrics' in self.comparison_results:
                f.write("\n" + "="*80 + "\n")
                f.write("3D DETECTION METRICS COMPARISON\n")
                f.write("="*80 + "\n\n")

                # First, write table format summary for overall metrics
                f.write("OVERALL 3D METRICS SUMMARY (by class)\n")
                f.write("-"*80 + "\n\n")

                for class_name, ranges in sorted(self.comparison_results['3d_metrics'].items()):
                    if 'overall' not in ranges:
                        continue

                    overall = ranges['overall']
                    if 'lateral_error' not in overall:
                        continue

                    f.write(f"{class_name.upper()}:\n")
                    f.write(f"{'Metric':<20} {self.model1_name:<15} {self.model2_name:<15} {'Diff':<15} {'Change %':<12} {'Result':<10}\n")
                    f.write("-"*100 + "\n")

                    for error_type, display_name in [('lateral_error', 'Lateral (m)'),
                                                      ('longitudinal_error', 'Longitudinal (m)'),
                                                      ('longitudinal_relative_error', 'Long Relative'),
                                                      ('heading_error', 'Heading (rad)')]:
                        if error_type in overall:
                            data = overall[error_type]
                            m1_str = f"{data[self.model1_name]['mean']:.4f}"
                            m2_str = f"{data[self.model2_name]['mean']:.4f}"
                            diff_str = f"{data['diff']:+.4f}"
                            change_str = f"{data['relative_change_%']:+.2f}%"
                            result_str = "✓ BETTER" if data['improvement'] else "✗ WORSE"

                            f.write(f"{display_name:<20} {m1_str:<15} {m2_str:<15} {diff_str:<15} {change_str:<12} {result_str:<10}\n")

                    # Add relaxed heading error if available
                    if 'heading_error_relaxed' in overall:
                        data = overall['heading_error_relaxed']
                        m1_str = f"{data[self.model1_name]['mean']:.4f}"
                        m2_str = f"{data[self.model2_name]['mean']:.4f}"
                        diff_str = f"{data['diff']:+.4f}"
                        change_str = f"{data['relative_change_%']:+.2f}%"
                        result_str = "✓ BETTER" if data['improvement'] else "✗ WORSE"

                        f.write(f"{'Heading Relaxed (rad)':<20} {m1_str:<15} {m2_str:<15} {diff_str:<15} {change_str:<12} {result_str:<10}\n")

                    # Add reversal statistics if available
                    if 'reversal_info' in overall:
                        rev_info = overall['reversal_info']
                        m1_rev = f"{rev_info[self.model1_name]['count']} ({rev_info[self.model1_name]['percentage']:.1f}%)"
                        m2_rev = f"{rev_info[self.model2_name]['count']} ({rev_info[self.model2_name]['percentage']:.1f}%)"
                        f.write(f"{'Reversal Cases':<20} {m1_rev:<15} {m2_rev:<15}\n")

                    f.write(f"Samples: {self.model1_name}={overall['lateral_error'][self.model1_name]['samples']}, "
                           f"{self.model2_name}={overall['lateral_error'][self.model2_name]['samples']}\n")
                    f.write("\n")

                # Helper: extract numeric start value from range keys like
                # "long_0-10m", "long_100-999m", "lat_-50--40m", "lat_-10-0m", "overall"
                def _range_sort_key(range_key):
                    if range_key == 'overall':
                        return float('inf')
                    try:
                        # Strip prefix (long_ / lat_) and trailing 'm'
                        stripped = range_key
                        for prefix in ('long_', 'lat_'):
                            if stripped.startswith(prefix):
                                stripped = stripped[len(prefix):]
                                break
                        stripped = stripped.rstrip('m')
                        # Find the separator dash: the '-' immediately preceded by a digit.
                        # This correctly handles negatives like "-50--40" (separator after '0')
                        # as well as "-10-0", "0-10", "100-999", etc.
                        m = re.search(r'(?<=\d)-', stripped)
                        if m:
                            return float(stripped[:m.start()])
                        return float('inf')
                    except (ValueError, IndexError):
                        return float('inf')

                # Helper: write one range block
                def _write_range_block(f, range_key, metrics):
                    """Write the detailed comparison table for one range to ``f``.

                    Args:
                        f: writable text stream receiving the table.
                        range_key: label printed in the block heading.
                        metrics: mapping of error-type name to its comparison
                            entry; error types that are absent are skipped.
                    """
                    f.write(f"\n  [{range_key}]:\n")

                    name_a = self.model1_name
                    name_b = self.model2_name
                    header = f"  {'Metric':<22} {name_a:<14} {name_b:<14} {'Diff':<14} {'Change %':<11} Result\n"
                    f.write(header)
                    f.write("  " + "-" * 82 + "\n")

                    # (metrics key, row label) in the order the rows are printed.
                    row_specs = (
                        ('lateral_error', 'Lateral (m)'),
                        ('longitudinal_error', 'Longitudinal (m)'),
                        ('longitudinal_relative_error', 'Long Relative'),
                        ('heading_error', 'Heading (rad)'),
                        ('heading_error_relaxed', 'Head Relaxed (rad)'),
                    )
                    for metric_key, row_label in row_specs:
                        if metric_key in metrics:
                            entry = metrics[metric_key]
                            mean_a = f"{entry[name_a]['mean']:.4f}"
                            mean_b = f"{entry[name_b]['mean']:.4f}"
                            delta = f"{entry['diff']:+.4f}"
                            percent = f"{entry['relative_change_%']:+.2f}%"
                            if entry['improvement']:
                                verdict = "✓"
                            else:
                                verdict = "✗"
                            f.write(f"  {row_label:<22} {mean_a:<14} {mean_b:<14} {delta:<14} {percent:<11} {verdict}\n")

                    # Sample counts come from the lateral entry, falling back to
                    # the longitudinal entry; '-' is printed when neither has one.
                    count_source = metrics.get('lateral_error', metrics.get('longitudinal_error', {}))
                    count_a = count_source.get(name_a, {}).get('samples', '-')
                    count_b = count_source.get(name_b, {}).get('samples', '-')
                    f.write(f"  Samples: {name_a}={count_a}  {name_b}={count_b}\n")

                # ── Longitudinal distance ranges ──────────────────────────────
                f.write("\n" + "-"*80 + "\n")
                f.write("3D METRICS BY LONGITUDINAL DISTANCE RANGE (long_*)\n")
                f.write("-"*80 + "\n")

                for class_name, ranges in sorted(self.comparison_results['3d_metrics'].items()):
                    # Collect longitudinal-range entries
                    long_items = {k: v for k, v in ranges.items()
                                  if isinstance(v, dict) and k.startswith('long_') and 'lateral_error' in v}
                    if not long_items:
                        continue

                    f.write(f"\n{class_name.upper()}:\n")
                    f.write("-"*80 + "\n")

                    # Compact summary table: one row per distance range, key metrics only
                    col1 = 12
                    f.write(f"\n  Quick summary (mean errors):\n")
                    f.write(f"  {'Range':<12} {'Samples':>8}  "
                            f"{'Lat(m)':>8} {'':>9}  "
                            f"{'Long(m)':>8} {'':>9}  "
                            f"{'LongRel':>8} {'':>9}  "
                            f"{'Head(rad)':>9} {''}\n")
                    hdr2 = (f"  {'':12} {'':>8}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>9} {self.model2_name}\n")
                    f.write(hdr2)
                    f.write("  " + "-"*110 + "\n")

                    for rk, metrics in sorted(long_items.items(), key=lambda x: _range_sort_key(x[0])):
                        label = rk.replace('long_', '')
                        n1 = metrics.get('lateral_error', {}).get(self.model1_name, {}).get('samples', 0)
                        lat1  = metrics['lateral_error'][self.model1_name]['mean'] if 'lateral_error' in metrics else float('nan')
                        lat2  = metrics['lateral_error'][self.model2_name]['mean'] if 'lateral_error' in metrics else float('nan')
                        lon1  = metrics['longitudinal_error'][self.model1_name]['mean'] if 'longitudinal_error' in metrics else float('nan')
                        lon2  = metrics['longitudinal_error'][self.model2_name]['mean'] if 'longitudinal_error' in metrics else float('nan')
                        lr1   = metrics['longitudinal_relative_error'][self.model1_name]['mean'] if 'longitudinal_relative_error' in metrics else float('nan')
                        lr2   = metrics['longitudinal_relative_error'][self.model2_name]['mean'] if 'longitudinal_relative_error' in metrics else float('nan')
                        h1    = metrics['heading_error'][self.model1_name]['mean'] if 'heading_error' in metrics else float('nan')
                        h2    = metrics['heading_error'][self.model2_name]['mean'] if 'heading_error' in metrics else float('nan')
                        lat_mark  = "✓" if lat2  < lat1  else "✗"
                        lon_mark  = "✓" if lon2  < lon1  else "✗"
                        lr_mark   = "✓" if lr2   < lr1   else "✗"
                        head_mark = "✓" if h2    < h1    else "✗"
                        f.write(f"  {label:<12} {n1:>8}  "
                                f"{lat1:>8.4f} {lat2:>8.4f}{lat_mark}  "
                                f"{lon1:>8.4f} {lon2:>8.4f}{lon_mark}  "
                                f"{lr1:>8.4f} {lr2:>8.4f}{lr_mark}  "
                                f"{h1:>9.4f} {h2:>8.4f}{head_mark}\n")

                    # Detailed per-range blocks
                    f.write(f"\n  Detailed breakdown:\n")
                    for rk, metrics in sorted(long_items.items(), key=lambda x: _range_sort_key(x[0])):
                        _write_range_block(f, rk, metrics)

                # ── Lateral distance ranges ───────────────────────────────────
                f.write("\n\n" + "-"*80 + "\n")
                f.write("3D METRICS BY LATERAL DISTANCE RANGE (lat_*)\n")
                f.write("-"*80 + "\n")

                for class_name, ranges in sorted(self.comparison_results['3d_metrics'].items()):
                    lat_items = {k: v for k, v in ranges.items()
                                 if isinstance(v, dict) and k.startswith('lat_') and 'lateral_error' in v}
                    if not lat_items:
                        continue

                    f.write(f"\n{class_name.upper()}:\n")
                    f.write("-"*80 + "\n")

                    f.write(f"\n  Quick summary (mean errors):\n")
                    f.write(f"  {'Range':<14} {'Samples':>8}  "
                            f"{'Lat(m)':>8} {'':>9}  "
                            f"{'Long(m)':>8} {'':>9}  "
                            f"{'LongRel':>8} {'':>9}  "
                            f"{'Head(rad)':>9} {''}\n")
                    hdr2 = (f"  {'':14} {'':>8}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>8} {self.model2_name:>9}  "
                            f"{self.model1_name:>9} {self.model2_name}\n")
                    f.write(hdr2)
                    f.write("  " + "-"*110 + "\n")

                    for rk, metrics in sorted(lat_items.items(), key=lambda x: _range_sort_key(x[0])):
                        label = rk.replace('lat_', '')
                        n1 = metrics.get('lateral_error', {}).get(self.model1_name, {}).get('samples', 0)
                        lat1  = metrics['lateral_error'][self.model1_name]['mean'] if 'lateral_error' in metrics else float('nan')
                        lat2  = metrics['lateral_error'][self.model2_name]['mean'] if 'lateral_error' in metrics else float('nan')
                        lon1  = metrics['longitudinal_error'][self.model1_name]['mean'] if 'longitudinal_error' in metrics else float('nan')
                        lon2  = metrics['longitudinal_error'][self.model2_name]['mean'] if 'longitudinal_error' in metrics else float('nan')
                        lr1   = metrics['longitudinal_relative_error'][self.model1_name]['mean'] if 'longitudinal_relative_error' in metrics else float('nan')
                        lr2   = metrics['longitudinal_relative_error'][self.model2_name]['mean'] if 'longitudinal_relative_error' in metrics else float('nan')
                        h1    = metrics['heading_error'][self.model1_name]['mean'] if 'heading_error' in metrics else float('nan')
                        h2    = metrics['heading_error'][self.model2_name]['mean'] if 'heading_error' in metrics else float('nan')
                        lat_mark  = "✓" if lat2  < lat1  else "✗"
                        lon_mark  = "✓" if lon2  < lon1  else "✗"
                        lr_mark   = "✓" if lr2   < lr1   else "✗"
                        head_mark = "✓" if h2    < h1    else "✗"
                        f.write(f"  {label:<14} {n1:>8}  "
                                f"{lat1:>8.4f} {lat2:>8.4f}{lat_mark}  "
                                f"{lon1:>8.4f} {lon2:>8.4f}{lon_mark}  "
                                f"{lr1:>8.4f} {lr2:>8.4f}{lr_mark}  "
                                f"{h1:>9.4f} {h2:>8.4f}{head_mark}\n")

                    f.write(f"\n  Detailed breakdown:\n")
                    for rk, metrics in sorted(lat_items.items(), key=lambda x: _range_sort_key(x[0])):
                        _write_range_block(f, rk, metrics)

            # Summary
            if 'summary' in self.comparison_results:
                f.write("\n" + "="*80 + "\n")
                f.write("SUMMARY\n")
                f.write("="*80 + "\n\n")

                summary = self.comparison_results['summary']

                f.write(f"2D Detection (by AP):\n")
                f.write(f"  {self.model2_name} wins: {summary['2d']['ap']['wins']}\n")
                f.write(f"  {self.model1_name} wins: {summary['2d']['ap']['losses']}\n")
                f.write(f"  Ties: {summary['2d']['ap']['ties']}\n\n")

                f.write(f"2D Detection (by F1 Score):\n")
                f.write(f"  {self.model2_name} wins: {summary['2d']['f1_score']['wins']}\n")
                f.write(f"  {self.model1_name} wins: {summary['2d']['f1_score']['losses']}\n")
                f.write(f"  Ties: {summary['2d']['f1_score']['ties']}\n\n")

                f.write(f"3D Detection:\n")
                f.write(f"  By Lateral Error:\n")
                f.write(f"    {self.model2_name} wins: {summary['3d']['lateral']['wins']}\n")
                f.write(f"    {self.model1_name} wins: {summary['3d']['lateral']['losses']}\n")
                f.write(f"    Ties: {summary['3d']['lateral']['ties']}\n")
                f.write(f"  By Longitudinal Error:\n")
                f.write(f"    {self.model2_name} wins: {summary['3d']['longitudinal']['wins']}\n")
                f.write(f"    {self.model1_name} wins: {summary['3d']['longitudinal']['losses']}\n")
                f.write(f"    Ties: {summary['3d']['longitudinal']['ties']}\n")
                f.write(f"  By Heading Error:\n")
                f.write(f"    {self.model2_name} wins: {summary['3d']['heading']['wins']}\n")
                f.write(f"    {self.model1_name} wins: {summary['3d']['heading']['losses']}\n")
                f.write(f"    Ties: {summary['3d']['heading']['ties']}\n")

        print(f"✓ Text report saved to: {output_file}")

    def generate_json_report(self, output_file):
        """Dump ``self.comparison_results`` to ``output_file`` as indented JSON.

        Args:
            output_file: path of the JSON file to create or overwrite.
        """
        print(f"\nGenerating JSON report: {output_file}")

        with open(output_file, mode='w') as report_handle:
            json.dump(obj=self.comparison_results, fp=report_handle, indent=2)

        print(f"✓ JSON report saved to: {output_file}")

    def compare_all(self):
        """Run every comparison step in order and return the collected results.

        Returns:
            The ``self.comparison_results`` object after all steps have run.
        """
        # Each step is looked up only when its turn comes, in this order.
        step_names = (
            'compare_2d_metrics',
            'compare_3d_metrics',
            'compare_per_case',
            'generate_summary_stats',
        )
        for step_name in step_names:
            getattr(self, step_name)()

        return self.comparison_results


def main(argv=None):
    """Command-line entry point: load two reports, compare them, write results.

    Reads the two evaluation report JSON files (and, optionally, a
    common-matches JSON file), builds a ``ModelComparator``, runs all
    comparisons, and writes ``comparison_report.txt`` and
    ``comparison_report.json`` into the output directory.

    Args:
        argv: optional list of command-line argument strings. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so a plain
            ``main()`` call behaves exactly as before.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``--help``.
        OSError: if an input file cannot be opened or the output directory
            cannot be created.
        json.JSONDecodeError: if an input file is not valid JSON.
    """
    parser = argparse.ArgumentParser(
        description='Compare evaluation results from two models',
        formatter_class=argparse.RawDescriptionHelpFormatter
    )

    parser.add_argument('--model1', type=str, required=True,
                       help='Path to model 1 evaluation report JSON')
    parser.add_argument('--model2', type=str, required=True,
                       help='Path to model 2 evaluation report JSON')
    parser.add_argument('--output-dir', type=str, default='comparison_results',
                       help='Output directory for comparison results')
    parser.add_argument('--model1-name', type=str, default='Model-1',
                       help='Display name for model 1')
    parser.add_argument('--model2-name', type=str, default='Model-2',
                       help='Display name for model 2')
    parser.add_argument('--common-matches', type=str, default=None,
                       help='Path to common_matches.json from find_common_matches.py. '
                            'If provided, 3D comparison will use common matches only.')

    args = parser.parse_args(argv)

    # Load reports. JSON is read as UTF-8 explicitly so the result does not
    # depend on the platform's locale encoding.
    print("="*80)
    print("MODEL COMPARISON TOOL")
    print("="*80)
    print(f"\nLoading model 1: {args.model1}")
    with open(args.model1, 'r', encoding='utf-8') as f:
        model1_report = json.load(f)

    print(f"Loading model 2: {args.model2}")
    with open(args.model2, 'r', encoding='utf-8') as f:
        model2_report = json.load(f)

    # Load common matches if provided
    common_matches_data = None
    if args.common_matches:
        print(f"Loading common matches: {args.common_matches}")
        with open(args.common_matches, 'r', encoding='utf-8') as f:
            common_matches_data = json.load(f)
        print("✓ Will use common matches for 3D comparison")

    # Create output directory
    os.makedirs(args.output_dir, exist_ok=True)

    # Derive detailed_3d_matches.json paths (same dir as evaluation_report.json)
    model1_detailed = str(Path(args.model1).parent / 'detailed_3d_matches.json')
    model2_detailed = str(Path(args.model2).parent / 'detailed_3d_matches.json')

    # Compare models
    comparator = ModelComparator(
        model1_report,
        model2_report,
        model1_name=args.model1_name,
        model2_name=args.model2_name,
        common_matches_data=common_matches_data,
        model1_detailed_path=model1_detailed,
        model2_detailed_path=model2_detailed,
    )

    # The results stay on the comparator, which the report writers read from.
    comparator.compare_all()

    # Generate reports
    text_output = os.path.join(args.output_dir, 'comparison_report.txt')
    json_output = os.path.join(args.output_dir, 'comparison_report.json')

    comparator.generate_text_report(text_output)
    comparator.generate_json_report(json_output)

    print("\n" + "="*80)
    print("COMPARISON COMPLETE")
    print("="*80)
    print(f"\nResults saved to: {args.output_dir}/")
    print("  - Text report: comparison_report.txt")
    print("  - JSON report: comparison_report.json")
    if common_matches_data:
        print("\nNote: 3D metrics comparison is based on common matches only.")
        # Both reports are already on disk here, so a common-matches file
        # without the statistics entry must not turn a finished run into a
        # crash; the count line is simply omitted in that case.
        try:
            common_count = common_matches_data['match_statistics']['common']
        except (KeyError, TypeError):
            common_count = None
        if common_count is not None:
            print(f"      Matched by both models: {common_count:,} samples")
    print("")


# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()