Files
HSAP/platform/as_platform/labeling/annotate.py

257 lines
9.0 KiB
Python
Raw Normal View History

"""Labeling data lake: batch directories, task lists, annotation JSON, and media files (CVAT is the sole annotation engine)."""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from urllib.parse import quote
from as_platform.data.batch import IMG_EXTS
from as_platform.data.core import load_wf, proj_root, resolve_pack_dir
from as_platform.db.engine import session_scope
from as_platform.db.models import LabelingCampaign, User
from as_platform.labeling.scope import enrich_batch_labels, load_dms_registry
# The legacy directory name is kept on purpose: export scripts still read
# labels/ls_annotations/.
ANNOTATIONS_DIRNAME = "ls_annotations"
def _load_campaign(campaign_id: str) -> LabelingCampaign | None:
    """Look up a labeling campaign by primary key.

    Args:
        campaign_id: Primary key passed to ``db.get``.

    Returns:
        Whatever ``db.get(LabelingCampaign, campaign_id)`` yields; the
        annotation allows ``None`` for a missing row.
    """
    # NOTE(review): the row is handed back after the session scope has been
    # left; whether its attributes stay readable depends on how session_scope
    # is configured -- confirm before relying on lazily loaded fields.
    with session_scope() as session:
        campaign = session.get(LabelingCampaign, campaign_id)
        return campaign
def resolve_campaign_batch_dir(camp: LabelingCampaign) -> Path:
    """Resolve the on-disk batch directory of a labeling campaign.

    The layout depends on ``camp.project``:

    * ``"dms"`` -- the registry YAML referenced by the workflow config is
      loaded and ``registry["tasks"][camp.task]`` selects the layout:
      a ``sources`` batch inside a pack, a mode-specific inbox taken from the
      registry (multi-type tasks with a mode), or the default
      ``inbox/<task>[/<mode>]/<batch>``.
    * ``"adas"`` -- ``inbox/<task>/<batch>``.
    * any other project -- the pack directory when ``camp.location == "pack"``
      and a pack is set, otherwise ``inbox/<batch>``.

    Args:
        camp: Campaign row; ``project``, ``task``, ``mode``, ``location``,
            ``pack`` and ``batch`` are read.

    Returns:
        The resolved path. Existence on disk is not checked here.

    Raises:
        ValueError: For a dms ``sources`` batch without a pack, or an adas
            campaign without a task.
    """
    workflow = load_wf()
    project_root = proj_root(workflow, camp.project)

    if camp.project == "dms":
        import yaml

        registry_file = project_root / workflow["projects"]["dms"]["registry"]
        registry = yaml.safe_load(registry_file.read_text(encoding="utf-8"))
        task_cfg = registry["tasks"][camp.task]

        if camp.location == "sources":
            if not camp.pack:
                raise ValueError("sources 批次需要 pack")
            pack_dir = resolve_pack_dir("dms", project_root, workflow, camp.pack)
            ingest_cfg = registry.get("ingest") or {}
            sources_subdir = ingest_cfg.get("sources_subdir", "sources")
            return (pack_dir / task_cfg["task_dir"] / sources_subdir / camp.batch).resolve()

        if task_cfg.get("type") == "multi" and camp.mode:
            from as_platform.labeling.scope import _dms_registry_api

            get_mode_config, resolve_task_id, _ = _dms_registry_api()
            resolved_task, resolved_mode = resolve_task_id(camp.task, camp.mode)
            mode_cfg = get_mode_config(resolved_task, resolved_mode, registry)
            inbox_rel = mode_cfg.get("inbox")
            if inbox_rel:
                # A mode-specific inbox from the registry is used as-is.
                return (project_root / inbox_rel).resolve()

        # Default dms layout; also reached when a multi task's mode config
        # declares no inbox.
        if camp.mode:
            return (project_root / "inbox" / camp.task / camp.mode / camp.batch).resolve()
        return (project_root / "inbox" / camp.task / camp.batch).resolve()

    if camp.project == "adas":
        if not camp.task:
            raise ValueError("adas campaign 需要 task")
        return (project_root / "inbox" / camp.task / camp.batch).resolve()

    if camp.location == "pack" and camp.pack:
        try:
            from as_platform.data.core import resolve_pack

            pack_rel = resolve_pack("lane", project_root, workflow, camp.pack)
            return (project_root / pack_rel).resolve()
        except ValueError:
            # On ValueError the pack value itself is used as a path relative
            # to the project root.
            return (project_root / camp.pack).resolve()

    return (project_root / "inbox" / camp.batch).resolve()
def _iter_batch_images(batch_dir: Path) -> list[Path]:
    """List the image files found under a batch directory.

    Three roots are walked recursively, in this order: ``<batch>/images``,
    ``<batch>/images/train`` and ``<batch>`` itself. Within each root the
    entries are visited in sorted order. The roots overlap, so every file is
    resolved and reported only once, at its first occurrence.

    Args:
        batch_dir: Batch directory to scan.

    Returns:
        Resolved paths of regular files whose suffix is listed in
        ``IMG_EXTS``. Empty when ``batch_dir`` is not a directory.
    """
    if not batch_dir.is_dir():
        return []

    search_roots = (
        batch_dir / "images",
        batch_dir / "images" / "train",
        batch_dir,
    )
    images: list[Path] = []
    seen: set[str] = set()
    for root in search_roots:
        if not root.is_dir():
            continue
        for entry in sorted(root.rglob("*")):
            suffix = entry.suffix
            # Accept upper/mixed-case extensions such as ".JPG" as well.
            # Assumes IMG_EXTS holds lowercase dotted suffixes -- TODO confirm.
            if suffix not in IMG_EXTS and suffix.lower() not in IMG_EXTS:
                continue
            # The suffix test is cheap, so it runs before the stat() call.
            if not entry.is_file():
                continue
            resolved = entry.resolve()
            key = str(resolved)
            if key in seen:
                continue
            seen.add(key)
            images.append(resolved)
    return images
def _task_id_for_image(image_path: Path, batch_dir: Path) -> str:
    """Derive the task id of an image.

    The id is the first 16 hex characters of the SHA-256 of the image's
    POSIX-style path relative to ``batch_dir``. When the image does not live
    under ``batch_dir`` the file stem (name without extension) is hashed
    instead.

    Args:
        image_path: Path of the image file.
        batch_dir: Batch directory the image is expected to live under.

    Returns:
        A 16-character lowercase hex string.
    """
    try:
        identity = image_path.relative_to(batch_dir).as_posix()
    except ValueError:
        identity = image_path.stem
    # "surrogateescape" yields the same bytes as a plain UTF-8 encode for every
    # ordinary string, but does not raise on file names that carry undecodable
    # bytes (which Python represents as lone surrogates).
    digest = hashlib.sha256(identity.encode("utf-8", errors="surrogateescape"))
    return digest.hexdigest()[:16]
def _annotations_dir(batch_dir: Path) -> Path:
    """Return ``<batch_dir>/labels/<ANNOTATIONS_DIRNAME>``, creating it if missing.

    Args:
        batch_dir: Batch directory that owns the annotations.

    Returns:
        Path of the annotations directory; it and any missing parents are
        created before returning.
    """
    annotations_root = batch_dir.joinpath("labels", ANNOTATIONS_DIRNAME)
    annotations_root.mkdir(parents=True, exist_ok=True)
    return annotations_root
def campaign_bootstrap(campaign_id: str) -> dict[str, Any]:
    """Build the bootstrap payload describing a campaign.

    The campaign's ``to_dict()`` is passed through ``enrich_batch_labels``
    (with the dms registry for dms projects, ``None`` otherwise) and then
    extended with ``batch_path``, ``image_count``, ``editor`` and the CVAT
    task id / job URL.

    Resolving or scanning the batch directory is best-effort: on any failure
    ``batch_path`` is ``None``, ``image_count`` is ``0`` and the error text is
    stored under ``batch_error``.

    Args:
        campaign_id: Primary key of the campaign.

    Returns:
        The enriched campaign dictionary.

    Raises:
        FileNotFoundError: If the campaign does not exist.
    """
    with session_scope() as session:
        campaign = session.get(LabelingCampaign, campaign_id)
        if not campaign:
            raise FileNotFoundError("campaign not found")

        registry = None
        if campaign.project == "dms":
            registry = load_dms_registry()
        info = enrich_batch_labels(campaign.to_dict(), registry)

        try:
            batch_dir = resolve_campaign_batch_dir(campaign)
            info["batch_path"] = str(batch_dir)
            info["image_count"] = len(_iter_batch_images(batch_dir))
        except Exception as exc:
            # Deliberately broad: a broken batch must not prevent the rest of
            # the campaign information from being returned.
            info["batch_path"] = None
            info["image_count"] = 0
            info["batch_error"] = str(exc)

        info["editor"] = "cvat"
        info["cvat_task_id"] = campaign.cvat_task_id
        info["cvat_job_url"] = campaign.cvat_job_url
        return info
def campaign_tasks(
    campaign_id: str,
    *,
    offset: int = 0,
    limit: int = 50,
    user: User | None = None,
    assignee: str | None = None,
) -> dict[str, Any]:
    """Return one page of labeling tasks (one task per batch image).

    Args:
        campaign_id: Primary key of the campaign.
        offset: Index of the first task of the page; negative values are
            treated as ``0``.
        limit: Maximum number of tasks on the page; negative values are
            treated as ``0``.
        user: Current user, needed for the ``assignee="me"`` filter.
        assignee: When ``"me"`` (and ``user`` is given) only the tasks
            assigned to ``user`` are listed.

    Returns:
        A dict with ``tasks``, ``total``, ``offset`` and ``limit``. With the
        ``"me"`` filter it also carries ``my_assigned``. A user with no
        assignment for whom ``user_is_coordinator`` is false gets an empty
        page with a ``hint`` instead.

    Raises:
        FileNotFoundError: If the campaign does not exist.
    """
    with session_scope() as db:
        camp = db.get(LabelingCampaign, campaign_id)
        if not camp:
            raise FileNotFoundError("campaign not found")
        batch_dir = resolve_campaign_batch_dir(camp)

    from as_platform.labeling.progress import get_assigned_task_ids, user_is_coordinator

    # A negative offset or limit would make the slice below count from the end
    # of the list and return an unrelated window.
    offset = max(offset, 0)
    limit = max(limit, 0)

    filter_ids: set[str] | None = None
    if assignee == "me" and user:
        filter_ids = get_assigned_task_ids(campaign_id, user.id)
        if not filter_ids and not user_is_coordinator(user):
            # Nothing assigned: answer before walking the batch directory.
            return {
                "tasks": [],
                "total": 0,
                "offset": offset,
                "limit": limit,
                "hint": "暂无分配给您的任务,请联系协调员在送标工作台均分任务",
            }

    images = _iter_batch_images(batch_dir)
    if filter_ids is not None:
        images = [img for img in images if _task_id_for_image(img, batch_dir) in filter_ids]
    total = len(images)

    tasks: list[dict[str, Any]] = []
    for img in images[offset : offset + limit]:
        try:
            rel = img.relative_to(batch_dir).as_posix()
        except ValueError:
            # Image resolved to a location outside the batch directory.
            rel = img.name
        media_path = quote(rel, safe="/")
        tasks.append(
            {
                "id": _task_id_for_image(img, batch_dir),
                "data": {
                    "image": f"/api/v1/labeling/media/{campaign_id}/{media_path}",
                },
                "meta": {"filename": img.name, "relative_path": rel},
            }
        )

    out: dict[str, Any] = {"tasks": tasks, "total": total, "offset": offset, "limit": limit}
    # filter_ids is only set on the assignee == "me" path.
    if filter_ids is not None:
        out["my_assigned"] = len(filter_ids)
    return out
def resolve_media_file(campaign_id: str, rel_path: str) -> Path:
    """Map a media path from a request to a file inside the campaign batch.

    Args:
        campaign_id: Primary key of the campaign.
        rel_path: Path relative to the batch directory.

    Returns:
        The resolved path of an existing regular file inside the batch
        directory.

    Raises:
        FileNotFoundError: If the campaign does not exist, the file does not
            exist, or the resolved target lies outside the batch directory.
        PermissionError: If ``rel_path`` is absolute, contains ``..`` or a
            NUL character.
    """
    with session_scope() as db:
        camp = db.get(LabelingCampaign, campaign_id)
        if not camp:
            raise FileNotFoundError("campaign not found")
        batch_dir = resolve_campaign_batch_dir(camp)

    # A NUL character can never name a real file and makes some path
    # operations raise ValueError; reject it like any other malformed path.
    if "\x00" in rel_path:
        raise PermissionError("invalid path")
    clean = Path(rel_path)
    if clean.is_absolute() or ".." in clean.parts:
        raise PermissionError("invalid path")

    base = batch_dir.resolve()
    target = (base / clean).resolve()
    # Containment is checked first so that nothing outside the batch directory
    # (e.g. reached through a symlink) is ever probed with is_file().
    if not target.is_relative_to(base) or not target.is_file():
        raise FileNotFoundError("media not found")
    return target
def get_annotation(campaign_id: str, task_id: str) -> dict[str, Any]:
    """Load the stored annotation of a task.

    Args:
        campaign_id: Primary key of the campaign.
        task_id: Task identifier; it becomes the annotation file name.

    Returns:
        The parsed annotation JSON, or an empty placeholder
        (``result`` ``None``, ``annotations`` ``[]``) when nothing has been
        saved for the task.

    Raises:
        FileNotFoundError: If the campaign does not exist, or ``task_id``
            contains a path separator or NUL character.
    """
    # task_id ends up in a file name: refuse anything that could point outside
    # the annotations directory.
    if any(ch in str(task_id) for ch in ("/", "\\", "\x00")):
        raise FileNotFoundError("invalid task id")

    with session_scope() as db:
        camp = db.get(LabelingCampaign, campaign_id)
        if not camp:
            raise FileNotFoundError("campaign not found")
        batch_dir = resolve_campaign_batch_dir(camp)

    # Same layout as _annotations_dir(), built by hand so that a read never
    # creates directories on disk.
    path = batch_dir / "labels" / ANNOTATIONS_DIRNAME / f"{task_id}.json"
    if not path.is_file():
        return {"task_id": task_id, "result": None, "annotations": []}
    return json.loads(path.read_text(encoding="utf-8"))
def save_annotation(
    campaign_id: str,
    task_id: str,
    payload: dict[str, Any],
    *,
    user: User | None = None,
) -> dict[str, Any]:
    """Write the annotation of a task to ``<annotations dir>/<task_id>.json``.

    The stored document is ``payload`` plus server-side fields: ``task_id``,
    ``source`` (``"hsap"``), ``saved_at`` and, when ``user`` is given,
    ``completed_by_user_id`` and ``completed_at``. Server-side fields win over
    same-named keys in ``payload``.

    Args:
        campaign_id: Primary key of the campaign.
        task_id: Task identifier; it becomes the annotation file name.
        payload: Annotation content sent by the client.
        user: Saving user. When given, ``assert_can_save_task`` is called
            first (presumably it raises when saving is not allowed -- confirm
            in ``labeling.progress``) and the task is marked completed if the
            saved annotation has a non-empty ``result``.

    Returns:
        ``{"ok": True, "path": <annotation file path>}``.

    Raises:
        FileNotFoundError: If the campaign does not exist, or ``task_id``
            contains a path separator or NUL character.
    """
    from as_platform.labeling.progress import assert_can_save_task, mark_task_completed

    # task_id ends up in a file name: refuse anything that could make the
    # write land outside the annotations directory.
    if any(ch in str(task_id) for ch in ("/", "\\", "\x00")):
        raise FileNotFoundError("invalid task id")

    if user:
        assert_can_save_task(campaign_id, task_id, user)

    with session_scope() as db:
        camp = db.get(LabelingCampaign, campaign_id)
        if not camp:
            raise FileNotFoundError("campaign not found")
        batch_dir = resolve_campaign_batch_dir(camp)

    path = _annotations_dir(batch_dir) / f"{task_id}.json"
    now = datetime.now(timezone.utc).isoformat()
    extra: dict[str, Any] = {"source": "hsap", "saved_at": now}
    if user:
        extra["completed_by_user_id"] = user.id
        extra["completed_at"] = now

    out = {"task_id": task_id, **payload, **extra}
    # A "task_id" key inside payload must not replace the id the file is
    # stored under; re-assigning keeps the key in first position.
    out["task_id"] = task_id
    path.write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")

    if user and _annotation_has_result(path):
        mark_task_completed(campaign_id, task_id, user.id)
    return {"ok": True, "path": str(path)}
def _annotation_has_result(path: Path) -> bool:
    """Tell whether an annotation file holds a non-empty ``result``.

    Args:
        path: Annotation JSON file.

    Returns:
        ``True`` only when the file can be read, parses as a JSON object and
        its ``"result"`` value is truthy (non-empty list/dict, non-empty
        string, non-zero number, ``true``). ``False`` in every other case,
        including a missing or unreadable file.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing/unreadable file or a directory.
        # ValueError: invalid JSON (JSONDecodeError) or invalid UTF-8.
        return False
    if not isinstance(data, dict):
        # A top-level list/number/string has no "result" field.
        return False
    return bool(data.get("result"))