Files
HSAP/platform/as_platform/deliveries/scan.py

231 lines
8.6 KiB
Python
Raw Normal View History

"""扫描 inbox / 数据湖目录,与批次台账对齐。"""
from __future__ import annotations
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from as_platform.data.core import load_wf, proj_root, register_batch
from as_platform.db.engine import session_scope
from as_platform.db.models import BatchDelivery, BatchIndex, User
from as_platform.deliveries.service import _new_delivery_id, _normalize_task
def _utcnow() -> datetime:
    """Return the current moment as a timezone-aware datetime in UTC."""
    return datetime.now(tz=timezone.utc)
def _dir_mtime_iso(path: Path) -> str | None:
try:
ts = path.stat().st_mtime
return datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
except OSError:
return None
def _scan_project_inbox(project: str, wf: dict | None = None) -> list[dict[str, Any]]:
    """Scan ``<project root>/inbox/<task>/<batch>`` and describe every batch directory.

    Each batch directory is matched against the delivery ledger
    (``BatchDelivery``) and the non-archived workbench index (``BatchIndex``)
    of the same project.

    Args:
        project: Project name; passed to ``proj_root`` and used to filter both tables.
        wf: Workflow configuration; loaded with ``load_wf()`` when falsy.

    Returns:
        One dict per batch directory, in sorted task-directory then
        batch-directory order. An empty list when the project has no
        ``inbox`` directory.
    """
    from as_platform.data.batch import count_images, count_label_files, dms_has_labels

    wf = wf or load_wf()
    inbox = proj_root(wf, project) / "inbox"
    if not inbox.is_dir():
        return []

    # Copy the ledger columns we need into plain dicts while the session is
    # open, so the directory walk below never touches ORM instances and does
    # not depend on them staying loaded once the session scope has exited.
    with session_scope() as db:
        ledger = {
            (row.project, row.task or "", row.mode or "", row.batch_name): {
                "id": row.id,
                "status": row.status,
                "collection_start": row.collection_start,
                "collection_end": row.collection_end,
                "created_at": row.created_at.isoformat() if row.created_at else None,
            }
            for row in db.query(BatchDelivery).filter(BatchDelivery.project == project).all()
        }
        indexed = {
            (row.task or "", row.batch)
            for row in db.query(BatchIndex)
            .filter(
                BatchIndex.project == project,
                BatchIndex.archived.is_(False),
            )
            .all()
        }

    items: list[dict[str, Any]] = []
    for task_dir in sorted(inbox.iterdir()):
        if not task_dir.is_dir():
            continue
        task_name = task_dir.name
        for batch_dir in sorted(task_dir.iterdir()):
            if not batch_dir.is_dir():
                continue
            batch_name = batch_dir.name

            # Images may sit directly in the batch directory or under images/.
            img_count = count_images(batch_dir)
            images_dir = batch_dir / "images"
            if not img_count and images_dir.is_dir():
                img_count = count_images(images_dir)

            labels_dir = batch_dir / "labels"
            lbl_count = count_label_files(labels_dir) if labels_dir.is_dir() else 0
            has_labels = lbl_count > 0 or dms_has_labels(batch_dir)
            # Only files under labels/ mark a batch as "returned"; labels
            # detected solely by dms_has_labels() leave the hint at "raw_pool".
            stage_hint = "returned" if lbl_count > 0 else "raw_pool"

            # Inbox directories carry no mode, so only ledger rows with an
            # empty mode can match.
            # NOTE(review): the lookup uses the raw directory name as task,
            # while register_scanned_to_ledger() stores _normalize_task(...)
            # -- confirm both yield the same value.
            delivery = ledger.get((project, task_name, "", batch_name))
            in_index = (task_name, batch_name) in indexed

            items.append({
                "project": project,
                "task": task_name,
                "mode": None,
                "batch": batch_name,
                "batch_name": batch_name,
                "path": str(batch_dir),
                "data_path": str(batch_dir),
                "images": img_count,
                "labels": lbl_count,
                "has_labels": has_labels,
                "stage_hint": stage_hint,
                "source_type": "inbox_scan",
                "delivery_id": delivery["id"] if delivery else None,
                "delivery_status": delivery["status"] if delivery else None,
                "in_ledger": delivery is not None,
                "in_workbench": in_index,
                # Without a ledger row, fall back to the directory mtime date.
                "collection_start": (
                    delivery["collection_start"] if delivery else _dir_mtime_iso(batch_dir)
                ),
                "collection_end": delivery["collection_end"] if delivery else None,
                "created_at": delivery["created_at"] if delivery else None,
                "needs_ledger": delivery is None,
                "needs_workbench": not in_index,
            })
    return items
def scan_delivery_sources(*, projects: list[str] | None = None) -> dict[str, Any]:
    """Scan project inboxes and summarise their ledger / workbench alignment.

    Args:
        projects: Project names to scan; when falsy, ``dms``, ``adas`` and
            ``lane`` are scanned.

    Returns:
        A dict with the scanned ``items``, their ``count``, how many still
        need a ledger entry (``needs_ledger``) or a workbench entry
        (``needs_workbench``), and the UTC scan time (``scanned_at``).
    """
    targets = projects or ["dms", "adas", "lane"]
    workflow = load_wf()  # loaded once and shared by every per-project scan
    found: list[dict[str, Any]] = [
        entry for name in targets for entry in _scan_project_inbox(name, workflow)
    ]
    missing_ledger = len([entry for entry in found if entry.get("needs_ledger")])
    missing_workbench = len([entry for entry in found if entry.get("needs_workbench")])
    return {
        "items": found,
        "count": len(found),
        "needs_ledger": missing_ledger,
        "needs_workbench": missing_workbench,
        "scanned_at": _utcnow().isoformat(),
    }
def register_scanned_to_ledger(
    items: list[dict[str, Any]],
    user: User,
    *,
    sync_workbench: bool = True,
) -> dict[str, Any]:
    """Register scanned batches in the delivery ledger and optionally sync the workbench.

    Batches that already sit in the inbox are recorded as ``in_lake``. New
    ledger rows are created with *user* as owner and submitter. Existing rows
    in ``draft`` / ``rejected`` / ``ingest_failed`` are moved to ``in_lake``
    and have their empty fields backfilled. Items without a batch name or
    data path, or whose data path is not an existing directory, are skipped.

    Args:
        items: Scan result dicts (``project``, ``task``, ``mode``,
            ``batch_name``/``batch``, ``data_path``/``path``, ``stage_hint``,
            ``collection_start``, ``collection_end``,
            ``images``/``estimated_count``, ``source_type``).
        user: User recorded as owner and submitter of newly created rows.
        sync_workbench: When true, also call ``register_batch`` for items
            whose stage hint is ``raw_pool`` or ``returned``.

    Returns:
        ``{"ok": True, "created": int, "updated": int,
        "synced_workbench": int, "items": [...]}`` where ``items`` holds
        ``to_dict()`` of every ledger row that was created or updated.
    """
    import logging

    log = logging.getLogger(__name__)

    created = 0
    updated = 0
    synced = 0
    out_items: list[dict[str, Any]] = []

    for raw in items:
        project = (raw.get("project") or "dms").strip()
        task = _normalize_task(project, raw.get("task"))
        mode = (raw.get("mode") or "").strip() or None
        batch_name = (raw.get("batch_name") or raw.get("batch") or "").strip()
        data_path = (raw.get("data_path") or raw.get("path") or "").strip()
        if not batch_name or not data_path:
            continue
        if not Path(data_path).is_dir():
            continue

        stage_hint = raw.get("stage_hint") or "raw_pool"
        collection_start = (
            (raw.get("collection_start") or "").strip() or _dir_mtime_iso(Path(data_path))
        )
        collection_end = (raw.get("collection_end") or "").strip() or None
        # Prefer the scanned image count; fall back to an explicit estimate.
        estimated = raw.get("images")
        if estimated is None:
            estimated = raw.get("estimated_count")

        with session_scope() as db:
            rec = (
                db.query(BatchDelivery)
                .filter_by(project=project, task=task, mode=mode, batch_name=batch_name)
                .first()
            )
            if not rec:
                rec = BatchDelivery(
                    id=_new_delivery_id(),
                    project=project,
                    task=task,
                    mode=mode,
                    batch_name=batch_name,
                    source_type=(raw.get("source_type") or "inbox_scan"),
                    collection_start=collection_start,
                    collection_end=collection_end,
                    data_path=data_path,
                    estimated_count=int(estimated) if estimated not in (None, "") else None,
                    status="in_lake",
                    inbox_path=data_path,
                    owner_user_id=user.id,
                    owner_name=user.name,
                    submitted_by_user_id=user.id,
                    submitted_by_name=user.name,
                )
                db.add(rec)
                created += 1
            else:
                # Only pre-ingest states are promoted; other statuses are kept.
                if rec.status in ("draft", "rejected", "ingest_failed"):
                    rec.status = "in_lake"
                # Backfill empty fields only; never overwrite existing values.
                if not rec.inbox_path:
                    rec.inbox_path = data_path
                if not rec.data_path:
                    rec.data_path = data_path
                if collection_start and not rec.collection_start:
                    rec.collection_start = collection_start
                if estimated not in (None, "") and not rec.estimated_count:
                    rec.estimated_count = int(estimated)
                if not rec.source_type:
                    rec.source_type = "inbox_scan"
                rec.updated_at = _utcnow()
                updated += 1
            # Flush before serialising so the row is written out first.
            db.flush()
            out_items.append(rec.to_dict())

        # Workbench sync runs after the ledger session scope has exited and
        # stays best-effort: a failure must not abort the remaining items,
        # but it is logged instead of being silently discarded.
        if sync_workbench and stage_hint in ("raw_pool", "returned"):
            try:
                register_batch(
                    None,
                    project,
                    task,
                    batch_name,
                    stage=stage_hint,
                    location="inbox",
                )
            except Exception:
                log.exception(
                    "workbench sync failed for project=%s task=%s batch=%s",
                    project,
                    task,
                    batch_name,
                )
            else:
                synced += 1

    return {
        "ok": True,
        "created": created,
        "updated": updated,
        "synced_workbench": synced,
        "items": out_items,
    }
def bridge_delivery_to_workbench(delivery_id: str) -> dict[str, Any]:
    """Sync an ``in_lake`` ledger delivery into the labeling workbench index.

    Args:
        delivery_id: Primary key of the ``BatchDelivery`` row.

    Returns:
        ``{"ok": True, "delivery_id": ..., "batch": ...}`` where ``batch`` is
        taken from the ``register_batch`` result.

    Raises:
        ValueError: If the delivery does not exist or its status is not
            ``in_lake``.
    """
    with session_scope() as db:
        rec = db.get(BatchDelivery, delivery_id)
        if not rec:
            raise ValueError("送标申请不存在")
        if rec.status != "in_lake":
            raise ValueError(f"当前状态不可同步工作台: {rec.status}")
        # Copy the needed columns into locals before the session scope exits.
        project, task, batch_name = rec.project, rec.task, rec.batch_name
        source_dir = rec.inbox_path or rec.data_path

    # A non-empty labels/ directory means annotations have already come back.
    # NOTE(review): this treats any directory entry as a label, whereas
    # _scan_project_inbox() relies on count_label_files() -- confirm the
    # difference is intended.
    labels_present = False
    if source_dir:
        labels_dir = Path(source_dir) / "labels"
        labels_present = labels_dir.is_dir() and any(labels_dir.iterdir())
    stage = "returned" if labels_present else "raw_pool"

    outcome = register_batch(None, project, task, batch_name, stage=stage, location="inbox")
    return {"ok": True, "delivery_id": delivery_id, "batch": outcome.get("batch")}