"""扫描 inbox / 数据湖目录,与批次台账对齐。""" from __future__ import annotations from datetime import datetime, timezone from pathlib import Path from typing import Any from as_platform.data.core import load_wf, proj_root, register_batch from as_platform.db.engine import session_scope from as_platform.db.models import BatchDelivery, BatchIndex, User from as_platform.deliveries.service import _new_delivery_id, _normalize_task def _utcnow() -> datetime: return datetime.now(timezone.utc) def _dir_mtime_iso(path: Path) -> str | None: try: ts = path.stat().st_mtime return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d") except OSError: return None def _scan_project_inbox(project: str, wf: dict | None = None) -> list[dict[str, Any]]: from as_platform.data.batch import count_images, count_label_files, dms_has_labels wf = wf or load_wf() root = proj_root(wf, project) inbox = root / "inbox" if not inbox.is_dir(): return [] with session_scope() as db: deliveries = { (r.project, r.task or "", r.mode or "", r.batch_name): r for r in db.query(BatchDelivery).filter(BatchDelivery.project == project).all() } indexed = { (r.task or "", r.batch) for r in db.query(BatchIndex).filter( BatchIndex.project == project, BatchIndex.archived.is_(False), ).all() } items: list[dict[str, Any]] = [] for task_dir in sorted(inbox.iterdir()): if not task_dir.is_dir(): continue for batch_dir in sorted(task_dir.iterdir()): if not batch_dir.is_dir(): continue task_name = task_dir.name batch_name = batch_dir.name img_count = count_images(batch_dir) if not img_count and (batch_dir / "images").is_dir(): img_count = count_images(batch_dir / "images") lbl_count = count_label_files(batch_dir / "labels") if (batch_dir / "labels").is_dir() else 0 has_labels = lbl_count > 0 or dms_has_labels(batch_dir) stage_hint = "returned" if has_labels and lbl_count > 0 else "raw_pool" key = (project, task_name, "", batch_name) delivery = deliveries.get(key) in_index = (task_name, batch_name) in indexed items.append({ "project": project, "task": task_name, "mode": None, "batch": batch_name, "batch_name": batch_name, "path": str(batch_dir), "data_path": str(batch_dir), "images": img_count, "labels": lbl_count, "has_labels": has_labels, "stage_hint": stage_hint, "source_type": "inbox_scan", "delivery_id": delivery.id if delivery else None, "delivery_status": delivery.status if delivery else None, "in_ledger": delivery is not None, "in_workbench": in_index, "collection_start": delivery.collection_start if delivery else _dir_mtime_iso(batch_dir), "collection_end": delivery.collection_end if delivery else None, "created_at": delivery.created_at.isoformat() if delivery and delivery.created_at else None, "needs_ledger": delivery is None, "needs_workbench": not in_index, }) return items def scan_delivery_sources(*, projects: list[str] | None = None) -> dict[str, Any]: """扫描 inbox,返回与台账、工作台对齐状态。""" projs = projects or ["dms", "adas", "lane"] wf = load_wf() items: list[dict[str, Any]] = [] for p in projs: items.extend(_scan_project_inbox(p, wf)) needs_ledger = sum(1 for i in items if i.get("needs_ledger")) needs_workbench = sum(1 for i in items if i.get("needs_workbench")) return { "items": items, "count": len(items), "needs_ledger": needs_ledger, "needs_workbench": needs_workbench, "scanned_at": _utcnow().isoformat(), } def register_scanned_to_ledger( items: list[dict[str, Any]], user: User, *, sync_workbench: bool = True, ) -> dict[str, Any]: """将扫描结果登记到台账;已在 inbox 的批次直接标为 in_lake 并同步工作台。""" created = 0 updated = 0 synced = 0 out_items: list[dict[str, Any]] = [] for raw in items: project = (raw.get("project") or "dms").strip() task = _normalize_task(project, raw.get("task")) mode = (raw.get("mode") or "").strip() or None batch_name = (raw.get("batch_name") or raw.get("batch") or "").strip() data_path = (raw.get("data_path") or raw.get("path") or "").strip() if not batch_name or not data_path: continue if not Path(data_path).is_dir(): continue stage_hint = raw.get("stage_hint") or "raw_pool" collection_start = (raw.get("collection_start") or "").strip() or _dir_mtime_iso(Path(data_path)) collection_end = (raw.get("collection_end") or "").strip() or None estimated = raw.get("images") if estimated is None: estimated = raw.get("estimated_count") with session_scope() as db: rec = ( db.query(BatchDelivery) .filter_by(project=project, task=task, mode=mode, batch_name=batch_name) .first() ) if not rec: rec = BatchDelivery( id=_new_delivery_id(), project=project, task=task, mode=mode, batch_name=batch_name, source_type=(raw.get("source_type") or "inbox_scan"), collection_start=collection_start, collection_end=collection_end, data_path=data_path, estimated_count=int(estimated) if estimated not in (None, "") else None, status="in_lake", inbox_path=data_path, owner_user_id=user.id, owner_name=user.name, submitted_by_user_id=user.id, submitted_by_name=user.name, ) db.add(rec) created += 1 else: if rec.status in ("draft", "rejected", "ingest_failed"): rec.status = "in_lake" if not rec.inbox_path: rec.inbox_path = data_path if not rec.data_path: rec.data_path = data_path if collection_start and not rec.collection_start: rec.collection_start = collection_start if estimated not in (None, "") and not rec.estimated_count: rec.estimated_count = int(estimated) if not rec.source_type: rec.source_type = "inbox_scan" rec.updated_at = _utcnow() updated += 1 db.flush() out_items.append(rec.to_dict()) if sync_workbench and stage_hint in ("raw_pool", "returned"): try: register_batch( None, project, task, batch_name, stage=stage_hint, location="inbox", ) synced += 1 except Exception: pass return { "ok": True, "created": created, "updated": updated, "synced_workbench": synced, "items": out_items, } def bridge_delivery_to_workbench(delivery_id: str) -> dict[str, Any]: """台账 in_lake 后同步到送标工作台索引。""" with session_scope() as db: rec = db.get(BatchDelivery, delivery_id) if not rec: raise ValueError("送标申请不存在") if rec.status != "in_lake": raise ValueError(f"当前状态不可同步工作台: {rec.status}") project = rec.project task = rec.task batch_name = rec.batch_name inbox_path = rec.inbox_path or rec.data_path stage = "raw_pool" if inbox_path: labels_dir = Path(inbox_path) / "labels" if labels_dir.is_dir() and any(labels_dir.iterdir()): stage = "returned" result = register_batch(None, project, task, batch_name, stage=stage, location="inbox") return {"ok": True, "delivery_id": delivery_id, "batch": result.get("batch")}