# Listing metadata: 235 lines, 7.8 KiB, Python, executable file.
from __future__ import annotations
|
|
|
|
import argparse
|
|
import csv
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
try:
|
|
from openpyxl import load_workbook
|
|
except ImportError:
|
|
load_workbook = None
|
|
|
|
|
|
# Absolute, resolved path of this script; anchors the default input lookup.
FILE = Path(__file__).resolve()
# Default table, located under examples/cncap/ in the parent of this script's directory.
DEFAULT_INPUT_FILE = FILE.parents[1].joinpath(
    "examples",
    "cncap",
    "G1M3_AFS1616_CNCAP-2024_11月_0306.xlsx",
)
# Header of the column extracted when --column-name is not supplied.
DEFAULT_COLUMN_NAME = "原始数据地址"
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the column extractor.

    Returns:
        The ``argparse`` namespace with the attributes ``input_file``,
        ``column_name``, ``sheet_name``, ``output_file``, ``json_file``
        (all strings) and ``dedupe``, ``list_columns`` (booleans).
    """
    cli = argparse.ArgumentParser(
        description="Extract one column from a CSV/XLSX table and print or save the values."
    )

    # String-valued options as (flag, default, help text). They are registered
    # in this order so the generated --help output keeps the same layout.
    text_options = (
        (
            "--input-file",
            str(DEFAULT_INPUT_FILE),
            "Path to the input .xlsx or .csv file.",
        ),
        (
            "--column-name",
            DEFAULT_COLUMN_NAME,
            "Header name of the target column.",
        ),
        (
            "--sheet-name",
            "",
            "Worksheet name for .xlsx files. Defaults to the first sheet.",
        ),
        (
            "--output-file",
            "",
            "Optional output text file. If omitted, values are written to stdout.",
        ),
        (
            "--json-file",
            "",
            "Optional output json file. Defaults to a sibling json file next to the input table.",
        ),
    )
    for flag, default, help_text in text_options:
        cli.add_argument(flag, type=str, default=default, help=help_text)

    # Boolean switches as (flag, help text); each is False unless given.
    switches = (
        (
            "--dedupe",
            "Remove duplicate values while preserving the original order.",
        ),
        (
            "--list-columns",
            "List the discovered header names and exit.",
        ),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    return cli.parse_args()
|
|
|
|
|
|
def normalize_header(value: str) -> str:
    """Return *value* as text with every run of whitespace removed.

    Falsy inputs (for example ``None`` or ``""``) normalize to ``""``.
    """
    text = str(value or "")
    without_whitespace = re.sub(r"\s+", "", text)
    return without_whitespace.strip()
|
|
|
|
|
|
def sanitize_filename(value: str) -> str:
    """Convert *value* into a fragment usable inside a file name.

    Runs of the characters ``\\ / : * ? " < > |`` and runs of whitespace are
    each replaced by a single underscore, then leading/trailing dots and
    underscores are trimmed. If nothing is left, ``"column"`` is returned.
    """
    text = str(value or "").strip()
    # Order matters: reserved characters first, then whitespace.
    for pattern in (r'[\\/:*?"<>|]+', r"\s+"):
        text = re.sub(pattern, "_", text)
    text = text.strip("._")
    return text if text else "column"
|
|
|
|
|
|
def build_default_json_path(input_path: Path, column_name: str) -> Path:
    """Return the default JSON output path for *input_path* and *column_name*.

    The result lives in the same directory as *input_path* and is named
    ``<input stem>_<sanitized column name>.json``.
    """
    file_name = f"{input_path.stem}_{sanitize_filename(column_name)}.json"
    return input_path.with_name(file_name)
|
|
|
|
|
|
def load_xlsx_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Read one worksheet of an .xlsx workbook into headers and row records.

    The first row containing at least one non-empty cell is used as the
    header row, and only columns whose header cell is non-empty are kept.
    Every cell is converted to stripped text (``None`` becomes ``""``).
    Rows that are empty, or empty in all kept columns, are skipped.

    Args:
        path: Workbook to open.
        sheet_name: Worksheet to read; an empty string selects the first sheet.

    Returns:
        A ``(headers, records, title)`` tuple, where ``records`` maps header
        text to cell text and ``title`` is the title of the worksheet read.

    Raises:
        ImportError: If openpyxl could not be imported.
        ValueError: If *sheet_name* is not in the workbook, or the worksheet
            has no non-empty row.
    """
    if load_workbook is None:
        raise ImportError("openpyxl is required for .xlsx files. Please install it first.")

    book = load_workbook(path, read_only=True, data_only=True)
    try:
        if not sheet_name:
            sheet = book[book.sheetnames[0]]
        elif sheet_name in book.sheetnames:
            sheet = book[sheet_name]
        else:
            available = ", ".join(book.sheetnames)
            raise ValueError(f"Worksheet {sheet_name!r} not found. Available sheets: {available}")

        headers: list[str] | None = None
        kept_columns: list[int] = []  # cell positions of the non-empty headers
        rows: list[dict[str, str]] = []

        for raw_row in sheet.iter_rows(values_only=True):
            cells = ["" if cell is None else str(cell).strip() for cell in raw_row]
            if not any(cells):
                continue

            if headers is None:
                # First non-empty row: remember which positions carry a header.
                kept_columns = [pos for pos, text in enumerate(cells) if text]
                headers = [cells[pos] for pos in kept_columns]
                continue

            width = len(cells)
            # Rows shorter than the header row are padded with empty text.
            entry = {
                name: (cells[pos] if pos < width else "")
                for name, pos in zip(headers, kept_columns)
            }
            if any(entry.values()):
                rows.append(entry)

        if headers is None:
            raise ValueError("The worksheet is empty.")

        return headers, rows, sheet.title
    finally:
        # Always release the workbook handle, including on the error paths.
        book.close()
|
|
|
|
|
|
def load_csv_table(path: Path) -> tuple[list[str], list[dict[str, str]], str]:
    """Read a CSV file into headers and row records.

    The file is decoded as UTF-8 (a leading BOM is consumed by the
    ``utf-8-sig`` codec). The dialect is sniffed from the first 4096
    characters; when the sample is blank or the sniffer cannot determine a
    delimiter (typical for single-column files), the standard ``csv.excel``
    dialect is used instead of failing.

    Args:
        path: CSV file to read.

    Returns:
        A ``(headers, records, sheet_name)`` tuple. ``headers`` are the
        stripped field names, ``records`` maps stripped header text to
        stripped cell text (rows whose values are all empty are skipped), and
        ``sheet_name`` is always ``""`` because CSV files have no worksheets.
    """
    with path.open("r", encoding="utf-8-sig", newline="") as file:
        sample = file.read(4096)
        file.seek(0)

        dialect: type[csv.Dialect] = csv.excel
        if sample.strip():
            try:
                dialect = csv.Sniffer().sniff(sample)
            except csv.Error:
                # Sniffing is a heuristic and raises when no delimiter can be
                # determined; a plain comma-separated read is the sane default.
                dialect = csv.excel

        reader = csv.DictReader(file, dialect=dialect)
        headers = reader.fieldnames or []
        records = []
        for row in reader:
            # Entries stored under a None key (cells beyond the header row)
            # are dropped; missing cells arrive as None and become "".
            normalized_row = {
                str(key).strip(): str(value or "").strip()
                for key, value in row.items()
                if key is not None
            }
            if any(normalized_row.values()):
                records.append(normalized_row)
        return [str(header).strip() for header in headers], records, ""
|
|
|
|
|
|
def load_table(path: Path, sheet_name: str) -> tuple[list[str], list[dict[str, str]], str]:
    """Load *path* with the reader that matches its (case-insensitive) extension.

    Args:
        path: Input table; must end in ``.xlsx`` or ``.csv``.
        sheet_name: Worksheet name, forwarded to the .xlsx reader only.

    Returns:
        The ``(headers, records, sheet_name)`` tuple of the selected reader.

    Raises:
        ValueError: If the extension is neither ``.xlsx`` nor ``.csv``.
    """
    suffix = path.suffix.lower()
    if suffix not in (".xlsx", ".csv"):
        raise ValueError(f"Unsupported input format: {suffix}. Only .xlsx and .csv are supported.")
    if suffix == ".csv":
        return load_csv_table(path)
    return load_xlsx_table(path, sheet_name)
|
|
|
|
|
|
def extract_column(records: list[dict[str, str]], column_name: str, dedupe: bool) -> list[str]:
    """Collect the non-empty values of one column from *records*.

    The column is resolved against the keys of the first record. An exact
    header match wins; otherwise headers are compared with all whitespace
    removed. Preferring the exact match keeps two headers that differ only in
    whitespace (e.g. ``"a b"`` and ``"ab"``) individually addressable.

    Args:
        records: Row dictionaries mapping header text to cell text.
        column_name: Header of the column to extract.
        dedupe: When true, keep only the first occurrence of each value.

    Returns:
        The stripped, non-empty values in row order; ``[]`` when *records*
        is empty.

    Raises:
        ValueError: If no header of the first record matches *column_name*.
    """
    if not records:
        return []

    first_record = records[0]
    if column_name in first_record:
        target_key = column_name
    else:
        # Fall back to a whitespace-insensitive comparison of the headers.
        normalized_to_actual = {normalize_header(name): name for name in first_record}
        target_key = normalized_to_actual.get(normalize_header(column_name))
    if target_key is None:
        available = ", ".join(first_record.keys())
        raise ValueError(f"Column {column_name!r} not found. Available columns: {available}")

    stripped = (record.get(target_key, "").strip() for record in records)
    values = [value for value in stripped if value]
    if dedupe:
        # dict preserves insertion order, so this keeps first occurrences.
        return list(dict.fromkeys(values))
    return values
|
|
|
|
|
|
def write_output(values: list[str], output_file: str) -> None:
    """Emit *values* one per line, to a file or to standard output.

    Args:
        values: Lines to write.
        output_file: Destination text file. Missing parent directories are
            created and a row-count notice goes to stderr. When empty, the
            values are written to stdout instead.
    """
    content = "\n".join(values)

    if not output_file:
        # Nothing at all is printed when the joined text is empty.
        if content:
            sys.stdout.write(content)
            sys.stdout.write("\n")
        return

    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # A trailing newline is added only when there is at least one value.
    file_text = f"{content}\n" if values else content
    output_path.write_text(file_text, encoding="utf-8")
    print(f"Saved {len(values)} rows to {output_path}", file=sys.stderr)
|
|
|
|
|
|
def write_json_output(
    input_path: Path,
    resolved_sheet_name: str,
    column_name: str,
    values: list[str],
    json_file: str,
) -> Path:
    """Write the extraction result as a UTF-8 JSON document.

    Args:
        input_path: Table the values were read from.
        resolved_sheet_name: Title of the worksheet that was read.
        column_name: Requested column header.
        values: Extracted values.
        json_file: Destination path; when empty, the default path derived
            from *input_path* and *column_name* is used.

    Returns:
        The path of the JSON file that was written.
    """
    if json_file:
        json_path = Path(json_file)
    else:
        json_path = build_default_json_path(input_path, column_name)
    json_path.parent.mkdir(parents=True, exist_ok=True)

    payload = {
        "input_file": str(input_path),
        "sheet_name": resolved_sheet_name,
        "column_name": column_name,
        "num_rows": len(values),
        "values": values,
    }
    # ensure_ascii=False keeps non-ASCII text readable in the output file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    json_path.write_text(f"{serialized}\n", encoding="utf-8")

    print(f"Saved json to {json_path}", file=sys.stderr)
    return json_path
|
|
|
|
|
|
def main() -> int:
    """Run the extractor: load the table, then list headers or export a column.

    Returns:
        ``0`` on success.

    Raises:
        FileNotFoundError: If the input file does not exist.
    """
    args = parse_args()

    input_path = Path(args.input_file)
    if not input_path.is_file():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    headers, records, resolved_sheet_name = load_table(input_path, args.sheet_name)

    if not args.list_columns:
        values = extract_column(records, args.column_name, args.dedupe)
        write_output(values, args.output_file)
        json_path = write_json_output(
            input_path,
            resolved_sheet_name,
            args.column_name,
            values,
            args.json_file,
        )

        # The sheet part is only reported when a worksheet title is known.
        sheet_info = ""
        if resolved_sheet_name:
            sheet_info = f", sheet={resolved_sheet_name}"
        summary = (
            f"Extracted {len(values)} rows from column {args.column_name!r} in {input_path}{sheet_info}, json={json_path}"
        )
        print(summary, file=sys.stderr)
        return 0

    # --list-columns: print the headers only and stop before any extraction.
    for header in headers:
        print(header)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|