Files
yolov26_3d/tools/model_inference/data_tools/extract_excel_column.py
2026-06-24 09:35:46 +08:00

235 lines
7.8 KiB
Python
Executable File

from __future__ import annotations
import argparse
import csv
import json
import re
import sys
from pathlib import Path
try:
from openpyxl import load_workbook
except ImportError:
load_workbook = None
# Absolute, symlink-resolved path of this script.
FILE = Path(__file__).resolve()
# Default input workbook: FILE.parents[1] is the parent of the directory that
# contains this script, so the file is looked up under <that dir>/examples/cncap/.
DEFAULT_INPUT_FILE = FILE.parents[1] / "examples" / "cncap" / "G1M3_AFS1616_CNCAP-2024_11月_0306.xlsx"
# Default header of the column to extract (roughly "raw data address" in English).
DEFAULT_COLUMN_NAME = "原始数据地址"
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        The parsed namespace with ``input_file``, ``column_name``,
        ``sheet_name``, ``output_file``, ``json_file`` (all strings) and the
        boolean switches ``dedupe`` and ``list_columns``.
    """
    parser = argparse.ArgumentParser(
        description="Extract one column from a CSV/XLSX table and print or save the values."
    )

    # (flag, default, help) for every string-valued option, in display order.
    text_options = (
        (
            "--input-file",
            str(DEFAULT_INPUT_FILE),
            "Path to the input .xlsx or .csv file.",
        ),
        (
            "--column-name",
            DEFAULT_COLUMN_NAME,
            "Header name of the target column.",
        ),
        (
            "--sheet-name",
            "",
            "Worksheet name for .xlsx files. Defaults to the first sheet.",
        ),
        (
            "--output-file",
            "",
            "Optional output text file. If omitted, values are written to stdout.",
        ),
        (
            "--json-file",
            "",
            "Optional output json file. Defaults to a sibling json file next to the input table.",
        ),
    )
    for flag, default, help_text in text_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    # (flag, help) for every on/off switch; all default to False.
    switch_options = (
        (
            "--dedupe",
            "Remove duplicate values while preserving the original order.",
        ),
        (
            "--list-columns",
            "List the discovered header names and exit.",
        ),
    )
    for flag, help_text in switch_options:
        parser.add_argument(flag, action="store_true", help=help_text)

    return parser.parse_args()
def normalize_header(value: str) -> str:
    """Return *value* as text with every whitespace character removed.

    Falsy inputs (``None``, ``""``, ``0``) normalize to the empty string.
    Used to compare header names without regard to embedded whitespace.
    """
    text = str(value) if value else ""
    collapsed = re.sub(r"\s+", "", text)
    return collapsed.strip()
def sanitize_filename(value: str) -> str:
    """Turn *value* into a string that is safe to embed in a file name.

    Runs of characters that are invalid in common file systems and runs of
    whitespace are each replaced by a single underscore; leading and trailing
    dots/underscores are dropped. Falls back to ``"column"`` when nothing
    usable remains.
    """
    text = str(value or "").strip()
    # Order matters: reserved characters first, then whitespace.
    for pattern in (r'[\\/:*?"<>|]+', r"\s+"):
        text = re.sub(pattern, "_", text)
    trimmed = text.strip("._")
    if trimmed:
        return trimmed
    return "column"
def build_default_json_path(input_path: Path, column_name: str) -> Path:
    """Return the default JSON path: a sibling of *input_path*.

    The file name is ``<input stem>_<sanitized column name>.json`` in the same
    directory as the input table.
    """
    column_part = sanitize_filename(column_name)
    file_name = f"{input_path.stem}_{column_part}.json"
    return input_path.with_name(file_name)
def load_xlsx_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Read a worksheet from an ``.xlsx`` file into header and record lists.

    The first row containing any non-empty cell is treated as the header row;
    only its non-empty cells define columns. Every later non-blank row becomes
    a ``{header: text}`` record with all values converted to stripped strings.

    Args:
        path: Location of the workbook.
        sheet_name: Worksheet to read; an empty string selects the first sheet.

    Returns:
        ``(headers, records, title)`` where ``title`` is the title of the
        worksheet that was actually read.

    Raises:
        ImportError: If openpyxl is not installed.
        ValueError: If *sheet_name* does not exist or the sheet has no
            non-blank rows.
    """
    if load_workbook is None:
        raise ImportError("openpyxl is required for .xlsx files. Please install it first.")

    book = load_workbook(path, read_only=True, data_only=True)
    try:
        if not sheet_name:
            sheet = book[book.sheetnames[0]]
        elif sheet_name in book.sheetnames:
            sheet = book[sheet_name]
        else:
            available = ", ".join(book.sheetnames)
            raise ValueError(f"Worksheet {sheet_name!r} not found. Available sheets: {available}")

        headers: list[str] | None = None
        column_positions: list[int] = []
        rows: list[dict[str, str]] = []

        for raw_row in sheet.iter_rows(values_only=True):
            cells = ["" if cell is None else str(cell).strip() for cell in raw_row]
            if not any(cells):
                # Entirely blank row: ignored both before and after the header.
                continue

            if headers is None:
                # First non-blank row: remember which columns carry a header.
                column_positions = [pos for pos, text in enumerate(cells) if text]
                headers = [cells[pos] for pos in column_positions]
                continue

            width = len(cells)
            entry: dict[str, str] = {}
            # NOTE: duplicate header names map to the same key; the later column wins.
            for name, pos in zip(headers, column_positions):
                # Rows may be shorter than the header row; pad with "".
                entry[name] = cells[pos] if pos < width else ""
            if any(entry.values()):
                rows.append(entry)

        if headers is None:
            raise ValueError("The worksheet is empty.")
        return headers, rows, sheet.title
    finally:
        # Always release the workbook's underlying file handle.
        book.close()
def load_csv_table(path: Path) -> tuple[list[str], list[dict[str, str]], str]:
    """Read a delimited text file into header and record lists.

    The file is decoded as UTF-8 (a leading BOM is ignored). The delimiter is
    detected from the first 4096 characters among comma, semicolon, tab and
    pipe. When detection fails -- which always happens for a single-column
    file, since it contains no delimiter at all -- the standard comma dialect
    (``csv.excel``) is used instead of letting ``csv.Error`` propagate.

    Args:
        path: Location of the CSV file.

    Returns:
        ``(headers, records, "")``. Headers and all values are stripped
        strings; rows whose values are all empty are skipped. The third item
        is always empty because CSV files have no worksheet name.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as file:
        sample = file.read(4096)
        file.seek(0)

        dialect = csv.excel
        if sample.strip():
            try:
                # Restricting the candidates keeps the sniffer from choosing an
                # arbitrary letter or symbol that merely occurs regularly.
                dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
            except csv.Error:
                # No usable delimiter (e.g. single-column file): use the default.
                dialect = csv.excel

        reader = csv.DictReader(file, dialect=dialect)
        headers = [str(header).strip() for header in (reader.fieldnames or [])]

        records: list[dict[str, str]] = []
        for row in reader:
            # Surplus fields are stored under the key None by DictReader and are
            # dropped; missing fields arrive as None and become "".
            normalized_row = {
                str(key).strip(): str(value or "").strip()
                for key, value in row.items()
                if key is not None
            }
            if any(normalized_row.values()):
                records.append(normalized_row)

    return headers, records, ""
def load_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Load a table from *path*, choosing the reader by file extension.

    Args:
        path: Input file; the extension is matched case-insensitively.
        sheet_name: Worksheet name, only used for ``.xlsx`` files.

    Returns:
        ``(headers, records, resolved_sheet_name)`` as produced by the
        format-specific loader.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.csv``.
    """
    suffix = path.suffix.lower()
    if suffix not in (".xlsx", ".csv"):
        raise ValueError(f"Unsupported input format: {suffix}. Only .xlsx and .csv are supported.")
    if suffix == ".csv":
        return load_csv_table(path)
    return load_xlsx_table(path, sheet_name)
def extract_column(records: list[dict[str, str]], column_name: str, dedupe: bool) -> list[str]:
    """Collect the non-empty values of one column from *records*.

    The column is matched against the keys of the first record after removing
    all whitespace from both names, so ``"raw data"`` matches ``"rawdata"``.

    Args:
        records: Row dictionaries keyed by header name.
        column_name: Header of the column to extract.
        dedupe: When true, keep only the first occurrence of each value.

    Returns:
        Stripped, non-empty values in row order; an empty list when *records*
        is empty.

    Raises:
        ValueError: If no key of the first record matches *column_name*.
    """
    if not records:
        return []

    first_keys = records[0].keys()
    lookup: dict[str, str] = {}
    for name in first_keys:
        lookup[normalize_header(name)] = name

    target_key = lookup.get(normalize_header(column_name))
    if target_key is None:
        available = ", ".join(first_keys)
        raise ValueError(f"Column {column_name!r} not found. Available columns: {available}")

    stripped = (record.get(target_key, "").strip() for record in records)
    values = [text for text in stripped if text]

    if dedupe:
        # dict preserves insertion order, so this keeps first occurrences.
        return list(dict.fromkeys(values))
    return values
def write_output(values: list[str], output_file: str) -> None:
    """Emit *values* one per line, to a file or to standard output.

    Args:
        values: Lines to write.
        output_file: Target text file (parent directories are created). When
            empty, the lines go to stdout instead.

    A confirmation message is printed to stderr only in the file case, so
    stdout carries nothing but the extracted values.
    """
    content = "\n".join(values)

    if not output_file:
        # Nothing at all is printed when there is no content.
        if content:
            sys.stdout.write(f"{content}\n")
        return

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    trailer = "\n" if values else ""
    output_path.write_text(content + trailer, encoding="utf-8")
    print(f"Saved {len(values)} rows to {output_path}", file=sys.stderr)
def write_json_output(
    input_path: Path,
    resolved_sheet_name: str,
    column_name: str,
    values: list[str],
    json_file: str,
) -> Path:
    """Write the extraction result and its provenance to a JSON file.

    Args:
        input_path: Table the values were read from.
        resolved_sheet_name: Worksheet that was read ("" for CSV input).
        column_name: Requested column header.
        values: Extracted values.
        json_file: Destination path; when empty, a default path next to
            *input_path* is derived from the column name.

    Returns:
        The path the JSON document was written to.
    """
    if json_file:
        json_path = Path(json_file)
    else:
        json_path = build_default_json_path(input_path, column_name)
    json_path.parent.mkdir(parents=True, exist_ok=True)

    payload = dict(
        input_file=str(input_path),
        sheet_name=resolved_sheet_name,
        column_name=column_name,
        num_rows=len(values),
        values=values,
    )
    # ensure_ascii=False keeps non-ASCII text (e.g. Chinese headers) readable.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    json_path.write_text(serialized + "\n", encoding="utf-8")

    print(f"Saved json to {json_path}", file=sys.stderr)
    return json_path
def main() -> int:
    """Run the command-line workflow and return the process exit code.

    Loads the input table, then either lists its headers (``--list-columns``)
    or extracts the requested column, writes it as text (file or stdout) and
    as JSON, and reports a summary on stderr.

    Returns:
        ``0`` on success.

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    args = parse_args()

    input_path = Path(args.input_file)
    if not input_path.is_file():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    headers, records, resolved_sheet_name = load_table(input_path, args.sheet_name)

    if args.list_columns:
        # Header listing mode: one name per line, no extraction performed.
        sys.stdout.writelines(f"{header}\n" for header in headers)
        return 0

    values = extract_column(records, args.column_name, args.dedupe)
    write_output(values, args.output_file)
    json_path = write_json_output(
        input_path,
        resolved_sheet_name,
        args.column_name,
        values,
        args.json_file,
    )

    # CSV input has no worksheet, so the sheet part is omitted there.
    sheet_info = ""
    if resolved_sheet_name:
        sheet_info = f", sheet={resolved_sheet_name}"
    summary = (
        f"Extracted {len(values)} rows from column {args.column_name!r} in {input_path}{sheet_info}, json={json_path}"
    )
    print(summary, file=sys.stderr)
    return 0
# Script entry point: run the CLI only when executed directly (not on import)
# and use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())