tools/pdcl_inference/get_clips_of_aeb.py

"""
从 AEB 场地计划表或 aeb_rawid.json 读取 rawid，查询每个 rawid 对应的
clip ukey 列表，并保存为可直接用于批量推理的单个 aeb_clips*.json。

默认输入：
  tools/pdcl_inference/G1Q3项目AEB场地计划表.xlsx
  默认读取 "实际测试进展" 和 "复测case" 两个 sheet。

兼容旧流程：
  也支持通过 --input 读取已生成的 aeb_rawid.json。

依赖：
    pip install pdcl_dss -i https://pypi.minieye.tech/

默认输出命名：
  未指定 --output 时，按 Excel 中 "CVE数据" 列的最小/最大时间自动命名，
  例如 aeb_clips-20260322152509_to_20260410183944.json

输出格式：
{
  "summary": {
    "total_scenarios": 123,
    "total_rawids": 456,
    "scenario_total_rawids": {
      "CPFA-25-6.5-20": 12
    }
  },
  "scenarios": {
    "CPFA-25-6.5-20": [
      {
        "sheet": "实际测试进展",
        "场景": "CPFA",
        "偏置": "25",
        "目标速度": "6.5",
        "自车速度": "20",
        "CVE数据": "20260323113612",
        "rawid": "ADAS_...",
        "clips": ["clip_ukey1", ...]
      }
    ]
  }
}
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import pandas as pd

FILE = Path(__file__).resolve()
ROOT = FILE.parents[2]
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

DEFAULT_XLSX = os.path.join(os.path.dirname(__file__), "G1Q3项目AEB场地计划表.xlsx")
DEFAULT_SHEET_NAME = "实际测试进展"
DEFAULT_RETEST_SHEET_NAME = "复测case"
DEFAULT_SHEET_NAMES = [DEFAULT_SHEET_NAME, DEFAULT_RETEST_SHEET_NAME]
DEFAULT_OUTPUT_PREFIX = "aeb_clips"

os.environ.setdefault("STS_UID", "dis-uploader")
os.environ.setdefault("STS_SECRET_KEY", "277310cc09724d315514a79701fecb0f")

try:
    from dotenv import load_dotenv

    load_dotenv()
except ImportError:
    pass


def _to_str(val):
    """将单元格值转换为字符串，NaN 返回 None。"""
    if pd.isna(val):
        return None
    if isinstance(val, float) and val == int(val):
        return str(int(val))
    return str(val).strip()


def _build_column_index(header: List[object]) -> Dict[str, int]:
    """根据表头名称构建列索引映射。"""
    header_map = {}
    for idx, value in enumerate(header):
        name = _to_str(value)
        if name:
            header_map[name] = idx

    required_columns = ["场景", "偏置", "目标速度", "自车速度", "rawid", "CVE数据"]
    missing = [name for name in required_columns if name not in header_map]
    if missing:
        raise ValueError(f"Excel 缺少必需列: {missing}；实际表头: {list(header_map.keys())}")
    return header_map


def _normalize_cve_timestamp(val: Optional[str]) -> Optional[str]:
    """将 CVE 数据列清洗为可用于命名的时间串。"""
    if not val:
        return None

    digits = re.sub(r"\D", "", val)
    if len(digits) >= 14:
        return digits[:14]
    if len(digits) >= 8:
        return digits[:8]
    return None


def _forward_fill_series(series: pd.Series) -> pd.Series:
    """仅对当前列做简单前向填充，避免 pandas 对 object 列的告警。"""
    filled = []
    last_valid = None
    for value in series.tolist():
        if not pd.isna(value):
            last_valid = value
            filled.append(value)
        else:
            filled.append(last_valid)
    return pd.Series(filled, index=series.index, dtype=object)


def _build_scenario_key(
    scenario: Optional[str],
    offset: Optional[str],
    target_speed: Optional[str],
    ego_speed: Optional[str],
) -> str:
    """构造完整工况键：场景-偏置-目标速度-自车速度。"""
    parts = [
        scenario or "未知工况",
        offset or "未知偏置",
        target_speed or "未知目标速度",
        ego_speed or "未知自车速度",
    ]
    return "-".join(parts)


def _cluster_records(records: List[dict], fallback_scenario: Optional[str] = None) -> Dict[str, List[dict]]:
    """按完整工况键聚类记录。"""
    clustered: Dict[str, List[dict]] = {}
    for record in records:
        scenario = _to_str(record.get("场景")) or fallback_scenario or "未知工况"
        offset = _to_str(record.get("偏置"))
        target_speed = _to_str(record.get("目标速度"))
        ego_speed = _to_str(record.get("自车速度"))
        scenario_key = _build_scenario_key(scenario, offset, target_speed, ego_speed)

        normalized_record = dict(record)
        normalized_record["场景"] = scenario
        clustered.setdefault(scenario_key, []).append(normalized_record)
    return clustered


def _merge_clustered_records(grouped_records: List[Dict[str, List[dict]]]) -> Dict[str, List[dict]]:
    """合并多个分组结果。"""
    merged: Dict[str, List[dict]] = {}
    for grouped in grouped_records:
        for scenario_key, records in grouped.items():
            merged.setdefault(scenario_key, []).extend(records)
    return merged


def parse_excel_sheet(xlsx_source, sheet_name: str = DEFAULT_SHEET_NAME) -> Dict[str, List[dict]]:
    """解析单个 Excel sheet，并按完整工况键聚类 rawid 记录。"""
    df = pd.read_excel(xlsx_source, sheet_name=sheet_name, header=None)

    header = list(df.iloc[0])
    column_index = _build_column_index(header)

    data = df.iloc[1:].copy().reset_index(drop=True)
    for name in ["场景", "偏置", "目标速度", "自车速度"]:
        data.iloc[:, column_index[name]] = _forward_fill_series(
            data.iloc[:, column_index[name]]
        )

    records: List[dict] = []
    for _, row in data.iterrows():
        rawid = _to_str(row.iloc[column_index["rawid"]])
        if rawid is None:
            continue

        record = {
            "sheet": sheet_name,
            "场景": _to_str(row.iloc[column_index["场景"]]) or "未知工况",
            "偏置": _to_str(row.iloc[column_index["偏置"]]),
            "目标速度": _to_str(row.iloc[column_index["目标速度"]]),
            "自车速度": _to_str(row.iloc[column_index["自车速度"]]),
            "CVE数据": _to_str(row.iloc[column_index["CVE数据"]]),
            "rawid": rawid,
        }
        records.append(record)

    return _cluster_records(records)


def parse_excel(
    xlsx_path: str,
    sheet_names: Optional[List[str]] = None,
    allow_missing_sheets: bool = False,
) -> Dict[str, List[dict]]:
    """解析一个或多个 Excel sheet，并合并为按完整工况键聚类的 rawid 记录。"""
    requested_sheet_names = sheet_names or DEFAULT_SHEET_NAMES
    excel = pd.ExcelFile(xlsx_path)
    available_sheet_names = set(excel.sheet_names)

    missing_sheet_names = [
        sheet_name
        for sheet_name in requested_sheet_names
        if sheet_name not in available_sheet_names
    ]
    if missing_sheet_names and not allow_missing_sheets:
        raise ValueError(
            f"Excel 缺少 sheet: {missing_sheet_names}；实际 sheet: {excel.sheet_names}"
        )

    if missing_sheet_names:
        print(
            f"警告：Excel 缺少 sheet {missing_sheet_names}，已跳过；"
            f"实际 sheet: {excel.sheet_names}"
        )

    grouped_records = []
    for sheet_name in requested_sheet_names:
        if sheet_name not in available_sheet_names:
            continue
        grouped_records.append(parse_excel_sheet(excel, sheet_name=sheet_name))

    if not grouped_records:
        raise ValueError(
            f"Excel 未找到任何可解析 sheet；期望 sheet: {requested_sheet_names}；"
            f"实际 sheet: {excel.sheet_names}"
        )

    return _merge_clustered_records(grouped_records)


def load_rawid_json(json_path: str) -> Dict[str, List[dict]]:
    """读取旧格式 aeb_rawid.json。"""
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError(f"输入 JSON 顶层必须是 dict，实际: {type(data).__name__}")

    if "scenarios" in data:
        scenario_data = data["scenarios"]
        if not isinstance(scenario_data, dict):
            raise ValueError("输入 JSON 的 scenarios 字段必须是 dict")
        data = scenario_data

    grouped_records = []
    for scenario_name, records in data.items():
        if not isinstance(records, list):
            raise ValueError(
                f"场景 {scenario_name} 对应的数据必须是 list，实际: {type(records).__name__}"
            )
        grouped_records.append(_cluster_records(records, fallback_scenario=_to_str(scenario_name)))
    return _merge_clustered_records(grouped_records)


def get_clip_ukeys_from_raw(raw_id: str) -> List[str]:
    """通过 raw_id 获取关联的 clip ukey 列表。"""
    from pdcl_dss import Raw

    with Raw(raw_id) as raw:
        return raw.list_clip_ukeys()


def get_cve_time_range(rawid_data: Dict[str, List[dict]]) -> Optional[Tuple[str, str]]:
    """从记录中提取 CVE 数据时间范围。"""
    timestamps = []
    for records in rawid_data.values():
        for record in records:
            timestamp = _normalize_cve_timestamp(_to_str(record.get("CVE数据")))
            if timestamp:
                timestamps.append(timestamp)

    if not timestamps:
        return None
    return min(timestamps), max(timestamps)


def resolve_output_path(
    output_arg: Optional[str],
    source_path: str,
    rawid_data: Dict[str, List[dict]],
) -> str:
    """解析最终输出路径；未显式指定时按 CVE 时间范围自动命名。"""
    if output_arg:
        return output_arg

    cve_time_range = get_cve_time_range(rawid_data)
    if cve_time_range is None:
        filename = f"{DEFAULT_OUTPUT_PREFIX}.json"
    else:
        start, end = cve_time_range
        suffix = start if start == end else f"{start}_to_{end}"
        filename = f"{DEFAULT_OUTPUT_PREFIX}-{suffix}.json"

    return os.path.join(os.path.dirname(source_path), filename)


def build_output_payload(scenario_records: Dict[str, List[dict]]) -> dict:
    """构造包含 summary 和 scenarios 的最终输出。"""
    total_rawids = sum(len(records) for records in scenario_records.values())
    return {
        "summary": {
            "total_scenarios": len(scenario_records),
            "total_rawids": total_rawids,
            "scenario_total_rawids": {
                scenario: len(records)
                for scenario, records in scenario_records.items()
            },
        },
        "scenarios": scenario_records,
    }


def build_clips_manifest(rawid_data: Dict[str, List[dict]]) -> Dict[str, List[dict]]:
    """为每条 rawid 记录补充 clips 字段。"""
    result: Dict[str, List[dict]] = {}
    clip_cache: Dict[str, List[str]] = {}

    total_rawids = sum(
        1
        for records in rawid_data.values()
        for record in records
        if record.get("rawid")
    )
    processed = 0

    for scenario, records in rawid_data.items():
        result[scenario] = []
        print(f"\n=== 工况: {scenario} ({len(records)} 条 rawid) ===")

        for rec in records:
            raw_id = rec.get("rawid")
            if not raw_id:
                print("  跳过缺少 rawid 的记录")
                continue

            processed += 1
            print(f"  [{processed}/{total_rawids}] {raw_id} ...", end=" ", flush=True)

            if raw_id in clip_cache:
                clips = clip_cache[raw_id]
                print(f"复用缓存，找到 {len(clips)} 个 clip")
            else:
                try:
                    clips = get_clip_ukeys_from_raw(raw_id)
                    clip_cache[raw_id] = clips
                    print(f"找到 {len(clips)} 个 clip")
                except Exception as e:
                    clips = []
                    clip_cache[raw_id] = clips
                    print(f"失败: {e}")

            entry = dict(rec)
            entry["clips"] = clips
            result[scenario].append(entry)

    return result


def print_summary(
    result: Dict[str, List[dict]],
    output_path: str,
    cve_time_range: Optional[Tuple[str, str]] = None,
) -> None:
    total_rawids = sum(len(records) for records in result.values())
    total_clips = sum(
        len(entry.get("clips", []))
        for records in result.values()
        for entry in records
    )
    print(f"\n已保存至：{output_path}")
    if cve_time_range:
        start, end = cve_time_range
        if start == end:
            print(f"CVE数据时间：{start}")
        else:
            print(f"CVE数据时间范围：{start} ~ {end}")
    print(f"共 {len(result)} 个工况，{total_rawids} 条 rawid，{total_clips} 个 clip")


def parse_args():
    parser = argparse.ArgumentParser(
        description="从 AEB 表格或 aeb_rawid.json 直接生成 aeb_clips.json。"
    )
    parser.add_argument(
        "--xlsx",
        default=None,
        help=(
            "Excel 文件路径；未指定且 --input 也未指定时，默认读取脚本同目录下的 "
            "G1Q3项目AEB场地计划表.xlsx"
        ),
    )
    parser.add_argument(
        "--sheet-name",
        default=None,
        help=(
            "Excel 单个 sheet 名称；指定后只读取该 sheet。"
            f"未指定时默认读取：{', '.join(DEFAULT_SHEET_NAMES)}"
        ),
    )
    parser.add_argument(
        "--sheet-names",
        default=None,
        help=(
            "Excel 多个 sheet 名称，用英文逗号分隔；"
            f"未指定时默认读取：{', '.join(DEFAULT_SHEET_NAMES)}"
        ),
    )
    parser.add_argument(
        "--input",
        default=None,
        help="兼容旧流程：输入 aeb_rawid.json 文件路径",
    )
    parser.add_argument(
        "--output",
        default=None,
        help="输出 JSON 文件路径；未指定时自动按 CVE数据 时间范围命名",
    )
    return parser.parse_args()


def main():
    args = parse_args()

    if args.xlsx and args.input:
        raise ValueError("--xlsx 和 --input 不能同时指定，请二选一。")
    if args.sheet_name and args.sheet_names:
        raise ValueError("--sheet-name 和 --sheet-names 不能同时指定，请二选一。")

    if args.input:
        print(f"读取 rawid JSON：{args.input}")
        rawid_data = load_rawid_json(args.input)
        source_path = args.input
    else:
        xlsx_path = args.xlsx or DEFAULT_XLSX
        print(f"读取 Excel：{xlsx_path}")
        if args.sheet_name:
            sheet_names = [args.sheet_name]
            allow_missing_sheets = False
        elif args.sheet_names:
            sheet_names = [
                sheet_name.strip()
                for sheet_name in args.sheet_names.split(",")
                if sheet_name.strip()
            ]
            if not sheet_names:
                raise ValueError("--sheet-names 至少需要指定一个有效 sheet 名称")
            allow_missing_sheets = False
        else:
            sheet_names = DEFAULT_SHEET_NAMES
            allow_missing_sheets = True

        print(f"读取 sheet：{', '.join(sheet_names)}")
        rawid_data = parse_excel(
            xlsx_path,
            sheet_names=sheet_names,
            allow_missing_sheets=allow_missing_sheets,
        )
        source_path = xlsx_path

    output_path = resolve_output_path(args.output, source_path, rawid_data)
    cve_time_range = get_cve_time_range(rawid_data)

    total_rawids = sum(len(records) for records in rawid_data.values())
    print(f"待查询 {len(rawid_data)} 个工况，{total_rawids} 条 rawid 记录")

    result = build_clips_manifest(rawid_data)
    output_payload = build_output_payload(result)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_payload, f, ensure_ascii=False, indent=2)

    print_summary(result, output_path, cve_time_range=cve_time_range)


if __name__ == "__main__":
    main()