408 lines
18 KiB
Python
408 lines
18 KiB
Python
"""Tests for the ImageHub dataset routes (milestone 1 walking skeleton).
|
|
|
|
Pure-helper unit tests always run. The integration tests (the end-to-end flow create dataset → upload with
content-addressed dedup → version snapshot → owner/admin authz → audit, plus the research-project link,
label-map and review tests) run only when BOTH:
|
|
- INITIATIVE_DATABASE_URL points at PostgreSQL (asyncpg), and
|
|
- S3_ENDPOINT_URL is set (a reachable MinIO; the dev stack maps it to http://localhost:19000).
|
|
|
|
export INITIATIVE_DATABASE_URL="postgresql+asyncpg://initiative:initiative_secret@127.0.0.1:15432/initiatives"
|
|
export S3_ENDPOINT_URL="http://localhost:19000" S3_ACCESS_KEY=minio_user S3_SECRET_KEY=minio_password \\
|
|
S3_BUCKET_ATTACHMENTS=initiative-attachments S3_BUCKET_EXPORTS=initiative-exports \\
|
|
S3_BUCKET_QUARANTINE=initiative-quarantine S3_PUBLIC_ENDPOINT_URL=http://localhost:19000
|
|
cd be0 && python -m unittest tests.test_imagehub_datasets -v
|
|
|
|
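Prereq for the integration tests: migration 017_imagehub_datasets.sql plus the later ImageHub migrations the
tests exercise (024 project link, 025 review events, 027 label map) applied (compose init mount
or scripts/apply_initiative_migrations.py).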
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import os
|
|
import unittest
|
|
import uuid
|
|
|
|
# Record what the caller really configured BEFORE any fallback values are injected below:
# these two flags decide whether the integration tests run.
_RUN_DB = os.environ.get("INITIATIVE_DATABASE_URL", "").strip().lower().startswith("postgresql")
_RUN_S3 = os.environ.get("S3_ENDPOINT_URL", "").strip() != ""

# Importing the routes pulls in src.minio.storage, which instantiates S3Settings(). Provide
# fallbacks so that import works without a real MinIO and the pure-unit tests can always run.
# The values mirror the dev stack's host-mapped MinIO; variables already set are left untouched,
# and the integration tests still only fire when S3_ENDPOINT_URL was set by the caller.
for _env_name, _dev_value in (
    ("S3_ENDPOINT_URL", "http://localhost:19000"),
    ("S3_ACCESS_KEY", "minio_user"),
    ("S3_SECRET_KEY", "minio_password"),
    ("S3_BUCKET_ATTACHMENTS", "initiative-attachments"),
    ("S3_BUCKET_EXPORTS", "initiative-exports"),
    ("S3_BUCKET_QUARANTINE", "initiative-quarantine"),
    ("S3_PUBLIC_ENDPOINT_URL", "http://localhost:19000"),
):
    os.environ.setdefault(_env_name, _dev_value)
|
|
|
|
|
|
class PureHelperTests(unittest.TestCase):
    """Unit tests for the string and sniffing helpers; they touch neither a database nor the network."""

    def test_build_blob_key_is_content_addressed(self) -> None:
        """The blob key is derived from the digest: lower-cased and sharded by its first two hex pairs."""
        from src.minio.storage import S3Storage

        self.assertEqual(
            S3Storage.build_blob_key("AbCdEf0123456789"),
            "blobs/ab/cd/abcdef0123456789",
        )

    def test_slugify_strips_diacritics_and_punct(self) -> None:
        """Diacritics and punctuation are removed; an empty name falls back to ``"dataset"``."""
        from src.imagehub_routes import _slugify

        expectations = (
            ("Bộ dữ liệu CT Ngực!! 2026", "bo-du-lieu-ct-nguc-2026"),
            ("", "dataset"),
        )
        for raw_name, slug in expectations:
            self.assertEqual(_slugify(raw_name), slug)

    def test_safe_logical_path_basename_only(self) -> None:
        """Only a sanitised basename survives, whatever directory syntax the client sent."""
        from src.imagehub_routes import _safe_logical_path

        expectations = (
            ("/evil/../a b.dcm", "a_b.dcm"),
            ("C:\\scans\\series1.nii.gz", "series1.nii.gz"),
            ("", "file"),
        )
        for raw_path, basename in expectations:
            self.assertEqual(_safe_logical_path(raw_path), basename)

    def test_safe_folder_path_preserves_dirs_rejects_traversal(self) -> None:
        """The directory part is kept (so an uploaded tree round-trips) but traversal is stripped."""
        from src.imagehub_routes import _safe_folder_path

        expectations = (
            # the directory is kept (basename dropped) so an uploaded tree round-trips
            ("imagesTr/ct_001.nii.gz", "imagesTr"),
            ("a/b/c/scan.nii.gz", "a/b/c"),
            # no directory component → dataset root
            ("readme.txt", ""),
            ("", ""),
            # leading slash + ".." traversal segments are stripped
            ("/evil/../x/y.dcm", "evil/x"),
            # backslashes normalise to forward slashes
            ("labelsTr\\sub\\m.nii.gz", "labelsTr/sub"),
        )
        for raw_path, folder in expectations:
            self.assertEqual(_safe_folder_path(raw_path), folder)

    def test_coerce_tags(self) -> None:
        """List entries are stringified and trimmed, blanks dropped; a non-list input yields ``[]``."""
        from src.imagehub_routes import _coerce_tags

        from_list = _coerce_tags(["CT", " MRI ", "", 7])
        self.assertEqual(from_list, ["CT", "MRI", "7"])

        from_string = _coerce_tags("nope")
        self.assertEqual(from_string, [])

    def test_coerce_label_map(self) -> None:
        """Only positive-integer keys with non-empty string values survive sanitisation."""
        from src.imagehub_routes import _coerce_label_map

        # valid entries kept + trimmed; non-positive / non-int keys and empty/non-str values dropped
        messy = {
            "1": " kidney ",
            "2": "tumor",
            "0": "bad",
            "-3": "bad",
            "x": "bad",
            "4": "",
            "+5": "bad",
            "1_0": "bad",
        }
        self.assertEqual(_coerce_label_map(messy), {"1": "kidney", "2": "tumor"})

        # integer keys coerce to strings; non-dict input → {}
        self.assertEqual(_coerce_label_map({1: "kidney"}), {"1": "kidney"})
        for not_a_dict in ("nope", None):
            self.assertEqual(_coerce_label_map(not_a_dict), {})

    def test_sniff_never_raises_on_non_imaging(self) -> None:
        """The metadata sniffer degrades to a dict instead of raising on non-imaging or junk input."""
        from src.imagehub_routes import _sniff_imaging_meta

        # plain bytes → {}; a .dcm name with junk must degrade to {} (never raise)
        plain_text_meta = _sniff_imaging_meta("notes.txt", b"hello world", "text/plain")
        self.assertEqual(plain_text_meta, {})

        junk_dicom_meta = _sniff_imaging_meta("x.dcm", b"DICM" + b"\x00" * 200, "application/dicom")
        self.assertIsInstance(junk_dicom_meta, dict)
|
|
|
|
|
|
def _bearer(uid: uuid.UUID, roles: list[str]) -> str:
    """Return an ``Authorization`` header value holding an HS256 JWT for ``uid`` with ``roles``.

    The token is signed with the application's ``jwt_secret()`` so the route functions under
    test accept it like a real login token.
    """
    import jwt

    from src.auth_jwt import jwt_secret

    # NOTE(review): "cv" is always sent as 0 here; its meaning is defined by the auth layer — confirm there.
    claims = {"sub": str(uid), "roles": roles, "cv": 0}
    token = jwt.encode(claims, jwt_secret(), algorithm="HS256")
    return "Bearer " + token
|
|
|
|
|
|
def _upload(name: str, data: bytes, ctype: str = "application/octet-stream"):
    """Wrap ``data`` in an in-memory Starlette ``UploadFile`` named ``name`` with content type ``ctype``."""
    from starlette.datastructures import Headers, UploadFile

    payload = io.BytesIO(data)
    content_headers = Headers({"content-type": ctype})
    return UploadFile(payload, filename=name, headers=content_headers)
|
|
|
|
|
|
@unittest.skipUnless(
    _RUN_DB and _RUN_S3,
    "Set INITIATIVE_DATABASE_URL=postgresql+asyncpg://… and S3_ENDPOINT_URL=… to run the integration test",
)
class ImagehubDatasetDbTests(unittest.IsolatedAsyncioTestCase):
    """End-to-end: create → upload (content-addressed dedup) → version → owner/admin authz → audit.

    The route functions are awaited directly (no HTTP client), with a bearer token passed as the
    ``authorization`` argument. Rows created by a test are recorded in ``_user_ids`` /
    ``_dataset_ids`` and deleted in ``asyncTearDown``.
    """

    async def asyncSetUp(self) -> None:
        """Start from a fresh DB engine and make sure the MinIO buckets exist (skip if unreachable)."""
        from src.initiative_db import engine as eng
        from src.minio.storage import storage

        # Create the tracking lists first so they exist however far setup gets.
        self._user_ids: list[uuid.UUID] = []
        self._dataset_ids: list[uuid.UUID] = []

        await eng.dispose_engine()
        # unittest does not call tearDown when setUp skips or raises, but cleanups always run
        # (after tearDown). Registering the disposal here guarantees the engine is released on
        # the skip path below and when a teardown delete fails.
        self.addAsyncCleanup(eng.dispose_engine)
        await eng.init_engine()
        try:
            await storage.ensure_buckets_exist()
        except Exception as exc:  # MinIO not reachable → skip rather than error
            self.skipTest(f"MinIO not reachable: {exc}")

    async def asyncTearDown(self) -> None:
        """Delete the datasets and users this test created.

        Datasets go first, then users. The engine itself is disposed by the cleanup registered
        in ``asyncSetUp``, which runs after this method even if a delete raises.
        """
        from sqlalchemy import delete

        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubDataset, User

        async with get_session() as session:
            for did in self._dataset_ids:
                await session.execute(delete(ImagehubDataset).where(ImagehubDataset.id == did))
            for uid in self._user_ids:
                await session.execute(delete(User).where(User.id == uid))
            await session.commit()

    async def _seed_user(self, *, admin: bool = False) -> uuid.UUID:
        """Insert a throwaway user, register it for teardown, and return its id.

        ``admin`` only changes the display name of the seeded row; the roles a test acts with
        come from the token built by ``_bearer``.
        """
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import User

        uid = uuid.uuid4()
        async with get_session() as session:
            session.add(
                User(
                    id=uid,
                    email=f"ih-{uid.hex[:10]}@ump.edu.vn",  # unique per user via the random id
                    password_hash="x",
                    full_name=("Quản trị" if admin else "Nhà nghiên cứu") + " Test",
                )
            )
            await session.commit()
        self._user_ids.append(uid)
        return uid

    async def test_dataset_research_project_link(self) -> None:
        """A dataset can be created linked to a research project ("workspace"); the list can be
        filtered to that project; bad/foreign project ids are rejected (migration 024)."""
        from fastapi import HTTPException

        from src.imagehub_routes import DatasetCreateIn, create_dataset, list_datasets
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ResearchProject

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # seed a research project ("workspace") owned by the user (cascade-cleaned with the user)
        proj_id = uuid.uuid4()
        async with get_session() as session:
            session.add(ResearchProject(id=proj_id, owner_user_id=owner, title="Đề tài thử nghiệm"))
            await session.commit()

        # create a dataset linked to the project → the link is persisted
        ds = await create_dataset(
            DatasetCreateIn(name="Bộ dữ liệu thuộc đề tài", researchProjectId=str(proj_id)),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.researchProjectId, str(proj_id))

        # a standalone dataset (no project) is still allowed and stays unlinked
        ds2 = await create_dataset(DatasetCreateIn(name="Bộ dữ liệu độc lập"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds2.id))
        self.assertIsNone(ds2.researchProjectId)

        # a non-existent project id is rejected (422)
        with self.assertRaises(HTTPException) as ctx:
            await create_dataset(
                DatasetCreateIn(name="x", researchProjectId=str(uuid.uuid4())), owner_tok
            )
        self.assertEqual(ctx.exception.status_code, 422)

        # ?projectId= filters the list to that project only (3rd positional arg = projectId)
        in_proj = await list_datasets("mine", owner_tok, str(proj_id))
        ids_in_proj = [d.id for d in in_proj]
        self.assertIn(ds.id, ids_in_proj)
        self.assertNotIn(ds2.id, ids_in_proj)
        self.assertTrue(all(d.researchProjectId == str(proj_id) for d in in_proj))

    async def test_update_label_map_sanitizes_and_persists(self) -> None:
        """update_dataset accepts a per-value label map, sanitizes it, and round-trips it (migration 027)."""
        from src.imagehub_routes import (
            DatasetCreateIn,
            DatasetUpdateIn,
            create_dataset,
            get_dataset,
            update_dataset,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        ds = await create_dataset(DatasetCreateIn(name="KiTS labels"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.labelMap, {})  # empty by default

        # garbage keys/values are dropped; valid ones trimmed + kept
        updated = await update_dataset(
            ds.id,
            DatasetUpdateIn(labelMap={"1": "kidney", "2": "tumor", "3": "cyst", "0": "bad", "x": "bad"}),
            owner_tok,
        )
        self.assertEqual(updated.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

        # persisted: a fresh read returns the same map
        fresh = await get_dataset(ds.id, owner_tok)
        self.assertEqual(fresh.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

    async def test_review_persists_decision_and_stats(self) -> None:
        """review_task writes a structured review event; review-stats tallies it per reviewer (025)."""
        from sqlalchemy import select

        from src.imagehub_routes import ReviewIn, review_stats, review_task
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import (
            ImagehubBlob,
            ImagehubDataset,
            ImagehubDatasetFile,
            ImagehubDatasetStage,
            ImagehubTask,
            ImagehubTaskReviewEvent,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # build the minimal chain (no upload): dataset + a Review stage + a file + a task already
        # advanced to that Review stage, assigned to the owner.
        ds_id, stage_id, file_id, task_id = (uuid.uuid4() for _ in range(4))
        sha = uuid.uuid4().hex
        async with get_session() as session:
            session.add(ImagehubDataset(id=ds_id, owner_user_id=owner, name="Review demo"))
            session.add(
                ImagehubDatasetStage(id=stage_id, dataset_id=ds_id, name="Rà soát 1", kind="review", seq=1)
            )
            # NOTE(review): this synthetic blob row is not tracked for teardown — confirm that
            # deleting the dataset (or a blob GC) removes it, otherwise each run leaves one behind.
            session.add(ImagehubBlob(sha256=sha, size_bytes=1))
            session.add(
                ImagehubDatasetFile(id=file_id, dataset_id=ds_id, logical_path="ct.nii.gz", blob_sha256=sha)
            )
            session.add(
                ImagehubTask(
                    id=task_id, dataset_id=ds_id, dataset_file_id=file_id, name="ct.nii.gz",
                    current_stage_id=stage_id, pipeline_state="inReview", queue_status="assigned",
                    assignee_user_id=owner,
                )
            )
            await session.commit()
        self._dataset_ids.append(ds_id)  # cascade-cleans stage/file/task/events in teardown

        # accept the review → a structured event is persisted (decision + reviewer + stage + note)
        await review_task(str(ds_id), str(task_id), ReviewIn(decision="accept", note="Đạt"), owner_tok)
        async with get_session() as session:
            evs = (
                await session.execute(
                    select(ImagehubTaskReviewEvent).where(ImagehubTaskReviewEvent.task_id == task_id)
                )
            ).scalars().all()
            self.assertEqual(len(evs), 1)
            self.assertEqual(evs[0].decision, "accept")
            self.assertEqual(evs[0].reviewer_user_id, owner)
            self.assertEqual(evs[0].stage_id, stage_id)
            self.assertEqual(evs[0].note, "Đạt")

        # the stats endpoint tallies it for the reviewer (authorization is the LAST positional arg)
        stats = await review_stats(str(ds_id), str(owner), 30, owner_tok)
        self.assertEqual(stats.accepted, 1)
        self.assertEqual(stats.rejected, 0)
        # a foreign reviewer has no tally
        empty = await review_stats(str(ds_id), str(uuid.uuid4()), 30, owner_tok)
        self.assertEqual(empty.accepted, 0)

    async def test_create_upload_dedup_version_authz_audit(self) -> None:
        """Walk the whole skeleton: create, upload twice (dedup), browse, authz, version, audit."""
        from fastapi import HTTPException
        from sqlalchemy import func, select

        from src.imagehub_routes import (
            DatasetCreateIn,
            VersionCreateIn,
            create_dataset,
            create_version,
            get_dataset,
            list_audit,
            list_datasets,
            list_files,
            list_versions,
            upload_files,
        )
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubBlob, ImagehubDatasetFile

        owner = await self._seed_user()
        admin = await self._seed_user(admin=True)
        other = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])
        admin_tok = _bearer(admin, ["admin"])
        other_tok = _bearer(other, ["viewer"])

        # create
        ds = await create_dataset(
            DatasetCreateIn(name="CT Ngực thử nghiệm", description="demo", modalityTags=["CT"]),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.name, "CT Ngực thử nghiệm")
        self.assertEqual(ds.modalityTags, ["CT"])
        self.assertEqual(ds.fileCount, 0)

        # upload the SAME content under two names → content-addressed dedup
        blob_bytes = uuid.uuid4().bytes * 64  # unique per run
        res = await upload_files(
            ds.id, [_upload("scan_a.bin", blob_bytes), _upload("scan_b.bin", blob_bytes)], owner_tok
        )
        self.assertTrue(res["ok"])
        shas = {f["sha256"] for f in res["files"]}
        self.assertEqual(len(shas), 1, "same content must hash to one sha256")
        deduped_flags = sorted(f["deduped"] for f in res["files"])
        self.assertEqual(deduped_flags, [False, True], "first stores the blob, second dedups")

        # DB: exactly one blob row for that sha256, two file rows for the dataset
        sha = next(iter(shas))
        async with get_session() as session:
            blob_count = (
                await session.execute(
                    select(func.count()).select_from(ImagehubBlob).where(ImagehubBlob.sha256 == sha)
                )
            ).scalar_one()
            file_count = (
                await session.execute(
                    select(func.count())
                    .select_from(ImagehubDatasetFile)
                    .where(ImagehubDatasetFile.dataset_id == uuid.UUID(ds.id))
                )
            ).scalar_one()
        self.assertEqual(blob_count, 1)
        self.assertEqual(file_count, 2)

        # browse files (each carries a presigned download URL)
        files = await list_files(ds.id, owner_tok)
        self.assertEqual(len(files), 2)
        self.assertTrue(all(f.downloadUrl for f in files))

        # authz: a non-admin other user can't see or read it
        owner_list = await list_datasets("mine", owner_tok)
        self.assertIn(ds.id, [d.id for d in owner_list])
        other_list = await list_datasets("all", other_tok)  # non-admin: scope=all ignored
        self.assertNotIn(ds.id, [d.id for d in other_list])
        with self.assertRaises(HTTPException) as ctx:
            await get_dataset(ds.id, other_tok)
        self.assertEqual(ctx.exception.status_code, 404)

        # admin sees every dataset (the clinical data repository)
        admin_list = await list_datasets("all", admin_tok)
        self.assertIn(ds.id, [d.id for d in admin_list])

        # version snapshot freezes the 2-file manifest
        ver = await create_version(ds.id, VersionCreateIn(message="phiên bản đầu"), owner_tok)
        self.assertEqual(ver.seq, 1)
        self.assertEqual(ver.fileCount, 2)
        versions = await list_versions(ds.id, owner_tok)
        self.assertEqual(len(versions), 1)

        # audit trail recorded each mutation
        audit = await list_audit(ds.id, owner_tok)
        actions = [a.action for a in audit]
        self.assertIn("Tạo bộ dữ liệu", actions)
        self.assertIn("Tải tệp lên", actions)
        self.assertIn("Tạo phiên bản", actions)
|
|
|
|
|
|
# Allow running this file directly as a script, in addition to ``python -m unittest``.
if __name__ == "__main__":
    unittest.main()
|