# yolov26_3d/tools/model_inference/data_tools/extract_excel_column.py

from __future__ import annotations

import argparse
import csv
import json
import re
import sys
from pathlib import Path

try:
    from openpyxl import load_workbook
except ImportError:
    load_workbook = None


# Absolute, resolved location of this script.
FILE = Path(__file__).resolve()
# Sample workbook used as the --input-file default; it is looked up under
# FILE.parents[1], i.e. the parent of the directory that contains this script.
DEFAULT_INPUT_FILE = FILE.parents[1].joinpath(
    "examples",
    "cncap",
    "G1M3_AFS1616_CNCAP-2024_11月_0306.xlsx",
)
# Header name used as the --column-name default.
DEFAULT_COLUMN_NAME = "原始数据地址"


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the column extraction tool.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so existing ``parse_args()`` callers behave exactly as before.
            Passing an explicit list lets other code and tests drive the
            parser without touching ``sys.argv``.

    Returns:
        Namespace with the string options ``input_file``, ``column_name``,
        ``sheet_name``, ``output_file`` and ``json_file`` plus the boolean
        flags ``dedupe`` and ``list_columns``.
    """
    parser = argparse.ArgumentParser(
        description="Extract one column from a CSV/XLSX table and print or save the values."
    )
    parser.add_argument(
        "--input-file",
        type=str,
        default=str(DEFAULT_INPUT_FILE),
        help="Path to the input .xlsx or .csv file.",
    )
    parser.add_argument(
        "--column-name",
        type=str,
        default=DEFAULT_COLUMN_NAME,
        help="Header name of the target column.",
    )
    parser.add_argument(
        "--sheet-name",
        type=str,
        default="",
        help="Worksheet name for .xlsx files. Defaults to the first sheet.",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        default="",
        help="Optional output text file. If omitted, values are written to stdout.",
    )
    parser.add_argument(
        "--json-file",
        type=str,
        default="",
        help="Optional output json file. Defaults to a sibling json file next to the input table.",
    )
    parser.add_argument(
        "--dedupe",
        action="store_true",
        help="Remove duplicate values while preserving the original order.",
    )
    parser.add_argument(
        "--list-columns",
        action="store_true",
        help="List the discovered header names and exit.",
    )
    return parser.parse_args(argv)


def normalize_header(value: str) -> str:
    """Return *value* with every whitespace character removed.

    Falsy input (``None``, ``""``, ``0``) yields an empty string; anything
    else is converted with ``str()`` first.
    """
    text = str(value) if value else ""
    # split() with no arguments breaks on any whitespace run, so re-joining
    # the pieces drops leading, trailing and interior whitespace at once.
    return "".join(text.split())


def sanitize_filename(value: str) -> str:
    """Turn *value* into a string usable as part of a file name.

    Runs of the characters ``\\ / : * ? " < > |`` and runs of whitespace are
    each replaced by a single underscore, then leading/trailing dots and
    underscores are removed. If nothing is left, ``"column"`` is returned.
    """
    text = str(value or "").strip()
    text = re.sub(r'[\\/:*?"<>|]+', "_", text)
    text = re.sub(r"\s+", "_", text)
    text = text.strip("._")
    if text:
        return text
    return "column"


def build_default_json_path(input_path: Path, column_name: str) -> Path:
    """Return the default JSON output path for *input_path* and *column_name*.

    The result sits in the same directory as the input table and is named
    ``<input stem>_<sanitized column name>.json``.
    """
    column_part = sanitize_filename(column_name)
    file_name = f"{input_path.stem}_{column_part}.json"
    return input_path.with_name(file_name)


def load_xlsx_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Read one worksheet of an .xlsx file into headers and row records.

    Every cell is converted to a stripped string (``None`` becomes ``""``).
    Rows whose cells are all empty are skipped. The first remaining row is
    the header row; only its non-empty cells define columns. Each later row
    becomes a dict keyed by header name and is kept only if at least one of
    its values is non-empty.

    Args:
        path: Workbook to open.
        sheet_name: Worksheet to read; an empty string selects the first
            entry of ``workbook.sheetnames``.

    Returns:
        ``(headers, records, worksheet_title)``.

    Raises:
        ImportError: If openpyxl could not be imported.
        ValueError: If *sheet_name* is not in the workbook, or the worksheet
            has no non-empty row.
    """
    if load_workbook is None:
        raise ImportError("openpyxl is required for .xlsx files. Please install it first.")

    workbook = load_workbook(path, read_only=True, data_only=True)
    try:
        if not sheet_name:
            worksheet = workbook[workbook.sheetnames[0]]
        elif sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
        else:
            available = ", ".join(workbook.sheetnames)
            raise ValueError(f"Worksheet {sheet_name!r} not found. Available sheets: {available}")

        # Lazily turn each raw row into stripped strings and drop blank rows.
        text_rows = (
            ["" if cell is None else str(cell).strip() for cell in raw_row]
            for raw_row in worksheet.iter_rows(values_only=True)
        )
        filled_rows = (cells for cells in text_rows if any(cells))

        # The first non-blank row is the header row.
        header_cells = next(filled_rows, None)
        if header_cells is None:
            raise ValueError("The worksheet is empty.")

        # Remember (position, name) for each non-empty header cell.
        columns = [(position, name) for position, name in enumerate(header_cells) if name]
        headers = [name for _, name in columns]

        records: list[dict[str, str]] = []
        for cells in filled_rows:
            record: dict[str, str] = {}
            for position, name in columns:
                # A row may be shorter than the header row; pad with "".
                record[name] = cells[position] if position < len(cells) else ""
            if any(record.values()):
                records.append(record)

        return headers, records, worksheet.title
    finally:
        workbook.close()


def load_csv_table(path: Path) -> tuple[list[str], list[dict[str, str]], str]:
    """Read a CSV file into stripped headers and row records.

    The file is decoded as UTF-8 (a leading BOM is tolerated). The delimiter
    is sniffed from the first 4096 characters, but only among comma,
    semicolon, tab and pipe. Without that restriction the sniffer may pick a
    character such as ``:`` or ``/`` out of path-like values, and on
    single-column files it raises ``csv.Error``. When no candidate delimiter
    can be determined the standard ``csv.excel`` dialect is used instead.

    Args:
        path: CSV file to read.

    Returns:
        ``(headers, records, "")``. Header names, record keys and record
        values are whitespace-stripped strings; rows whose values are all
        empty are dropped. The third item is always an empty string because
        CSV files have no worksheet name.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as file:
        sample = file.read(4096)
        file.seek(0)

        dialect = csv.excel
        if sample.strip():
            try:
                dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
            except csv.Error:
                # Typical for single-column files: fall back to plain
                # comma-separated parsing instead of crashing.
                dialect = csv.excel

        reader = csv.DictReader(file, dialect=dialect)
        headers = reader.fieldnames or []
        records = []
        for row in reader:
            # DictReader stores surplus cells under the key None and fills
            # missing cells with None; drop the former, blank the latter.
            normalized_row = {
                str(key).strip(): str(value or "").strip()
                for key, value in row.items()
                if key is not None
            }
            if any(normalized_row.values()):
                records.append(normalized_row)
    return [str(header).strip() for header in headers], records, ""


def load_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Load a table from *path*, choosing the reader by file extension.

    The extension is compared case-insensitively. ``.xlsx`` files go to
    ``load_xlsx_table`` (which receives *sheet_name*), ``.csv`` files go to
    ``load_csv_table``.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.csv``.
    """
    extension = path.suffix.lower()
    if extension not in (".xlsx", ".csv"):
        raise ValueError(f"Unsupported input format: {extension}. Only .xlsx and .csv are supported.")
    if extension == ".csv":
        return load_csv_table(path)
    return load_xlsx_table(path, sheet_name)


def extract_column(records: list[dict[str, str]], column_name: str, dedupe: bool) -> list[str]:
    """Collect the non-empty values of one column from *records*.

    The column is matched against the keys of the first record after
    whitespace has been removed from both names via ``normalize_header``.

    Args:
        records: Row dicts keyed by header name.
        column_name: Header of the wanted column.
        dedupe: When true, keep only the first occurrence of each value.

    Returns:
        Stripped, non-empty values in row order; an empty list when
        *records* is empty.

    Raises:
        ValueError: If no key of the first record matches *column_name*.
    """
    if not records:
        return []

    first_record = records[0]
    lookup: dict[str, str] = {}
    for actual_name in first_record:
        lookup[normalize_header(actual_name)] = actual_name

    target_key = lookup.get(normalize_header(column_name))
    if target_key is None:
        available = ", ".join(first_record)
        raise ValueError(f"Column {column_name!r} not found. Available columns: {available}")

    stripped = (record.get(target_key, "").strip() for record in records)
    values = [value for value in stripped if value]
    if dedupe:
        # dict keys keep insertion order, so this keeps first occurrences.
        return list(dict.fromkeys(values))
    return values


def write_output(values: list[str], output_file: str) -> None:
    """Write *values* one per line to a file or to stdout.

    Args:
        values: Lines to emit.
        output_file: Destination path. When non-empty, parent directories
            are created, the file is written as UTF-8 and a summary line is
            printed to stderr. When empty, the joined text is written to
            stdout, unless it is an empty string.
    """
    joined = "\n".join(values)

    if not output_file:
        if joined:
            print(joined)
        return

    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # A trailing newline is added whenever there is at least one value.
    trailer = "\n" if values else ""
    destination.write_text(joined + trailer, encoding="utf-8")
    print(f"Saved {len(values)} rows to {destination}", file=sys.stderr)


def write_json_output(
    input_path: Path,
    resolved_sheet_name: str,
    column_name: str,
    values: list[str],
    json_file: str,
) -> Path:
    """Save the extracted values and their provenance as a JSON document.

    Args:
        input_path: Table the values came from; stored as a string.
        resolved_sheet_name: Worksheet title stored under ``sheet_name``.
        column_name: Column header stored under ``column_name``.
        values: Extracted values; their count is stored as ``num_rows``.
        json_file: Destination path. When empty, the path is obtained from
            ``build_default_json_path(input_path, column_name)``.

    Returns:
        The path the JSON document was written to.
    """
    if json_file:
        destination = Path(json_file)
    else:
        destination = build_default_json_path(input_path, column_name)
    destination.parent.mkdir(parents=True, exist_ok=True)

    document = {
        "input_file": str(input_path),
        "sheet_name": resolved_sheet_name,
        "column_name": column_name,
        "num_rows": len(values),
        "values": values,
    }
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    serialized = json.dumps(document, ensure_ascii=False, indent=2)
    destination.write_text(serialized + "\n", encoding="utf-8")

    print(f"Saved json to {destination}", file=sys.stderr)
    return destination


def main() -> int:
    """Run the command-line tool and return its exit status.

    Loads the input table, then either prints the header names (with
    ``--list-columns``) or extracts the requested column, writes it as text
    and as JSON, and prints a summary line to stderr.

    Returns:
        ``0`` on success.

    Raises:
        FileNotFoundError: If the input path is not an existing file.
    """
    options = parse_args()
    source = Path(options.input_file)
    if not source.is_file():
        raise FileNotFoundError(f"Input file not found: {source}")

    headers, records, resolved_sheet_name = load_table(source, options.sheet_name)

    if options.list_columns:
        for name in headers:
            print(name)
        return 0

    values = extract_column(records, options.column_name, options.dedupe)
    write_output(values, options.output_file)
    json_path = write_json_output(
        source,
        resolved_sheet_name,
        options.column_name,
        values,
        options.json_file,
    )

    # CSV input has no worksheet, so the sheet part is left out in that case.
    if resolved_sheet_name:
        sheet_info = f", sheet={resolved_sheet_name}"
    else:
        sheet_info = ""
    summary = (
        f"Extracted {len(values)} rows from column {options.column_name!r} "
        f"in {source}{sheet_info}, json={json_path}"
    )
    print(summary, file=sys.stderr)
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())