"""Tests for the ImageHub dataset routes (milestone 1 walking skeleton). Pure-helper unit tests always run. The full integration test (create dataset → upload with content-addressed dedup → version snapshot → owner/admin authz → audit) runs only when BOTH: - INITIATIVE_DATABASE_URL points at PostgreSQL (asyncpg), and - S3_ENDPOINT_URL is set (a reachable MinIO; the dev stack maps it to http://localhost:19000). export INITIATIVE_DATABASE_URL="postgresql+asyncpg://initiative:initiative_secret@127.0.0.1:15432/initiatives" export S3_ENDPOINT_URL="http://localhost:19000" S3_ACCESS_KEY=minio_user S3_SECRET_KEY=minio_password \\ S3_BUCKET_ATTACHMENTS=initiative-attachments S3_BUCKET_EXPORTS=initiative-exports \\ S3_BUCKET_QUARANTINE=initiative-quarantine S3_PUBLIC_ENDPOINT_URL=http://localhost:19000 cd be0 && python -m unittest tests.test_imagehub_datasets -v Prereq for the integration test: migration 017_imagehub_datasets.sql applied (compose init mount or scripts/apply_initiative_migrations.py). """ from __future__ import annotations import io import os import unittest import uuid _RUN_DB = os.getenv("INITIATIVE_DATABASE_URL", "").strip().lower().startswith("postgresql") _RUN_S3 = bool(os.getenv("S3_ENDPOINT_URL", "").strip()) # Let the module (which imports src.minio.storage → S3Settings()) import even when not running # against a real MinIO, so the pure-unit tests below can always run. These defaults match the # dev stack's host-mapped MinIO; the integration test only fires when S3_ENDPOINT_URL was set. 
os.environ.setdefault("S3_ENDPOINT_URL", "http://localhost:19000")
os.environ.setdefault("S3_ACCESS_KEY", "minio_user")
os.environ.setdefault("S3_SECRET_KEY", "minio_password")
os.environ.setdefault("S3_BUCKET_ATTACHMENTS", "initiative-attachments")
os.environ.setdefault("S3_BUCKET_EXPORTS", "initiative-exports")
os.environ.setdefault("S3_BUCKET_QUARANTINE", "initiative-quarantine")
os.environ.setdefault("S3_PUBLIC_ENDPOINT_URL", "http://localhost:19000")


class PureHelperTests(unittest.TestCase):
    """No DB / no network — string + sniff helpers.

    Each test imports the helper it exercises inside the test body, so a failing import only
    fails that one test.
    """

    def test_build_blob_key_is_content_addressed(self) -> None:
        """The blob key is lower-cased and sharded by the first two hex byte pairs."""
        from src.minio.storage import S3Storage

        key = S3Storage.build_blob_key("AbCdEf0123456789")
        self.assertEqual(key, "blobs/ab/cd/abcdef0123456789")

    def test_slugify_strips_diacritics_and_punct(self) -> None:
        """Vietnamese diacritics and punctuation are removed; empty input falls back to "dataset"."""
        from src.imagehub_routes import _slugify

        self.assertEqual(_slugify("Bộ dữ liệu CT Ngực!! 2026"), "bo-du-lieu-ct-nguc-2026")
        self.assertEqual(_slugify(""), "dataset")

    def test_safe_logical_path_basename_only(self) -> None:
        """Only the basename survives (POSIX or Windows separators); empty input becomes "file"."""
        from src.imagehub_routes import _safe_logical_path

        self.assertEqual(_safe_logical_path("/evil/../a b.dcm"), "a_b.dcm")
        self.assertEqual(_safe_logical_path("C:\\scans\\series1.nii.gz"), "series1.nii.gz")
        self.assertEqual(_safe_logical_path(""), "file")

    def test_safe_folder_path_preserves_dirs_rejects_traversal(self) -> None:
        """The directory part is kept, traversal segments are dropped, separators normalised."""
        from src.imagehub_routes import _safe_folder_path

        # the directory is kept (basename dropped) so an uploaded tree round-trips
        self.assertEqual(_safe_folder_path("imagesTr/ct_001.nii.gz"), "imagesTr")
        self.assertEqual(_safe_folder_path("a/b/c/scan.nii.gz"), "a/b/c")
        # no directory component → dataset root
        self.assertEqual(_safe_folder_path("readme.txt"), "")
        self.assertEqual(_safe_folder_path(""), "")
        # leading slash + ".." traversal segments are stripped
        self.assertEqual(_safe_folder_path("/evil/../x/y.dcm"), "evil/x")
        # backslashes normalise to forward slashes
        self.assertEqual(_safe_folder_path("labelsTr\\sub\\m.nii.gz"), "labelsTr/sub")

    def test_coerce_tags(self) -> None:
        """List entries are stringified and trimmed, blanks dropped; a non-list input yields []."""
        from src.imagehub_routes import _coerce_tags

        self.assertEqual(_coerce_tags(["CT", " MRI ", "", 7]), ["CT", "MRI", "7"])
        self.assertEqual(_coerce_tags("nope"), [])

    def test_coerce_label_map(self) -> None:
        """Only positive plain-integer keys with non-empty string values survive coercion."""
        from src.imagehub_routes import _coerce_label_map

        # valid entries kept + trimmed; non-positive / non-int keys and empty/non-str values dropped
        self.assertEqual(
            _coerce_label_map(
                {"1": " kidney ", "2": "tumor", "0": "bad", "-3": "bad", "x": "bad", "4": "", "+5": "bad", "1_0": "bad"}
            ),
            {"1": "kidney", "2": "tumor"},
        )
        # integer keys coerce to strings; non-dict input → {}
        self.assertEqual(_coerce_label_map({1: "kidney"}), {"1": "kidney"})
        self.assertEqual(_coerce_label_map("nope"), {})
        self.assertEqual(_coerce_label_map(None), {})

    def test_sniff_never_raises_on_non_imaging(self) -> None:
        """The metadata sniffer degrades to a dict result instead of raising on junk input."""
        from src.imagehub_routes import _sniff_imaging_meta

        # plain bytes → {}; a .dcm name with junk must degrade to {} (never raise)
        self.assertEqual(_sniff_imaging_meta("notes.txt", b"hello world", "text/plain"), {})
        self.assertIsInstance(_sniff_imaging_meta("x.dcm", b"DICM" + b"\x00" * 200, "application/dicom"), dict)


def _bearer(uid: uuid.UUID, roles: list[str]) -> str:
    """Build an ``Authorization`` header value carrying an HS256 JWT for ``uid``.

    The token payload is ``{"sub": str(uid), "roles": roles, "cv": 0}``, signed with the
    application's ``jwt_secret()``.
    """
    import jwt

    from src.auth_jwt import jwt_secret

    return "Bearer " + jwt.encode({"sub": str(uid), "roles": roles, "cv": 0}, jwt_secret(), algorithm="HS256")


def _upload(name: str, data: bytes, ctype: str = "application/octet-stream"):
    """Wrap ``data`` in an in-memory Starlette ``UploadFile`` named ``name`` with content type ``ctype``."""
    from starlette.datastructures import Headers, UploadFile

    return UploadFile(io.BytesIO(data), filename=name, headers=Headers({"content-type": ctype}))


@unittest.skipUnless(
    _RUN_DB and _RUN_S3,
    "Set INITIATIVE_DATABASE_URL=postgresql+asyncpg://… and S3_ENDPOINT_URL=… to run the integration test",
)
class ImagehubDatasetDbTests(unittest.IsolatedAsyncioTestCase):
    """End-to-end: create → upload (content-addressed dedup) → version → owner/admin authz → audit."""

    async def asyncSetUp(self) -> None:
        """Start from a fresh engine and make sure the buckets exist; skip when MinIO is down."""
        from src.initiative_db import engine as eng
        from src.minio.storage import storage

        # Cleanup registries come first so they exist whatever happens later in the fixture.
        self._user_ids: list[uuid.UUID] = []
        self._dataset_ids: list[uuid.UUID] = []

        await eng.dispose_engine()
        await eng.init_engine()
        try:
            await storage.ensure_buckets_exist()
        except Exception as exc:  # MinIO not reachable → skip rather than error
            # unittest does not call tearDown when setUp skips, so release the engine
            # here instead of leaving it open for the rest of the run.
            await eng.dispose_engine()
            self.skipTest(f"MinIO not reachable: {exc}")

    async def asyncTearDown(self) -> None:
        """Delete the datasets and users this test registered, then dispose the engine."""
        from sqlalchemy import delete

        from src.initiative_db import engine as eng
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubDataset, User

        try:
            async with get_session() as session:
                # datasets first, then the users that own them
                for did in self._dataset_ids:
                    await session.execute(delete(ImagehubDataset).where(ImagehubDataset.id == did))
                for uid in self._user_ids:
                    await session.execute(delete(User).where(User.id == uid))
                await session.commit()
        finally:
            # Dispose even when cleanup fails, so a broken delete cannot leak the engine.
            await eng.dispose_engine()

    async def _seed_user(self, *, admin: bool = False) -> uuid.UUID:
        """Insert a throw-away user, register it for teardown, and return its id.

        ``admin`` only changes the stored display name; the roles a test acts with are whatever
        it passes to ``_bearer``.
        """
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import User

        uid = uuid.uuid4()
        async with get_session() as session:
            session.add(
                User(
                    id=uid,
                    email=f"ih-{uid.hex[:10]}@ump.edu.vn",
                    password_hash="x",
                    full_name=("Quản trị" if admin else "Nhà nghiên cứu") + " Test",
                )
            )
            await session.commit()
        self._user_ids.append(uid)
        return uid

    async def test_dataset_research_project_link(self) -> None:
        """A dataset can be created linked to a research project ("workspace"); the list can be
        filtered to that project; a non-existent project id is rejected with 422 (migration 024)."""
        from fastapi import HTTPException

        from src.imagehub_routes import DatasetCreateIn, create_dataset, list_datasets
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ResearchProject

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # seed a research project ("workspace") owned by the user (cascade-cleaned with the user)
        proj_id = uuid.uuid4()
        async with get_session() as session:
            session.add(ResearchProject(id=proj_id, owner_user_id=owner, title="Đề tài thử nghiệm"))
            await session.commit()

        # create a dataset linked to the project → the link is persisted
        ds = await create_dataset(
            DatasetCreateIn(name="Bộ dữ liệu thuộc đề tài", researchProjectId=str(proj_id)),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.researchProjectId, str(proj_id))

        # a standalone dataset (no project) is still allowed and stays unlinked
        ds2 = await create_dataset(DatasetCreateIn(name="Bộ dữ liệu độc lập"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds2.id))
        self.assertIsNone(ds2.researchProjectId)

        # a non-existent project id is rejected (422)
        with self.assertRaises(HTTPException) as ctx:
            await create_dataset(
                DatasetCreateIn(name="x", researchProjectId=str(uuid.uuid4())), owner_tok
            )
        self.assertEqual(ctx.exception.status_code, 422)

        # ?projectId= filters the list to that project only (3rd positional arg = projectId)
        in_proj = await list_datasets("mine", owner_tok, str(proj_id))
        ids_in_proj = [d.id for d in in_proj]
        self.assertIn(ds.id, ids_in_proj)
        self.assertNotIn(ds2.id, ids_in_proj)
        self.assertTrue(all(d.researchProjectId == str(proj_id) for d in in_proj))

    async def test_update_label_map_sanitizes_and_persists(self) -> None:
        """update_dataset accepts a per-value label map, sanitizes it, and round-trips it (migration 027)."""
        from src.imagehub_routes import (
            DatasetCreateIn,
            DatasetUpdateIn,
            create_dataset,
            get_dataset,
            update_dataset,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        ds = await create_dataset(DatasetCreateIn(name="KiTS labels"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.labelMap, {})  # empty by default

        # garbage keys/values are dropped; valid ones trimmed + kept
        updated = await update_dataset(
            ds.id,
            DatasetUpdateIn(labelMap={"1": "kidney", "2": "tumor", "3": "cyst", "0": "bad", "x": "bad"}),
            owner_tok,
        )
        self.assertEqual(updated.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

        # persisted: a fresh read returns the same map
        fresh = await get_dataset(ds.id, owner_tok)
        self.assertEqual(fresh.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

    async def test_review_persists_decision_and_stats(self) -> None:
        """review_task writes a structured review event; review-stats tallies it per reviewer (025)."""
        from sqlalchemy import select

        from src.imagehub_routes import ReviewIn, review_stats, review_task
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import (
            ImagehubBlob,
            ImagehubDataset,
            ImagehubDatasetFile,
            ImagehubDatasetStage,
            ImagehubTask,
            ImagehubTaskReviewEvent,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # build the minimal chain (no upload): dataset + a Review stage + a file + a task already
        # advanced to that Review stage, assigned to the owner.
        ds_id, stage_id, file_id, task_id = (uuid.uuid4() for _ in range(4))
        sha = uuid.uuid4().hex
        async with get_session() as session:
            session.add(ImagehubDataset(id=ds_id, owner_user_id=owner, name="Review demo"))
            session.add(
                ImagehubDatasetStage(id=stage_id, dataset_id=ds_id, name="Rà soát 1", kind="review", seq=1)
            )
            # NOTE(review): asyncTearDown only deletes datasets and users; confirm whether this
            # blob row is cleaned up elsewhere or is safe to leave behind.
            session.add(ImagehubBlob(sha256=sha, size_bytes=1))
            session.add(
                ImagehubDatasetFile(id=file_id, dataset_id=ds_id, logical_path="ct.nii.gz", blob_sha256=sha)
            )
            session.add(
                ImagehubTask(
                    id=task_id,
                    dataset_id=ds_id,
                    dataset_file_id=file_id,
                    name="ct.nii.gz",
                    current_stage_id=stage_id,
                    pipeline_state="inReview",
                    queue_status="assigned",
                    assignee_user_id=owner,
                )
            )
            await session.commit()
        self._dataset_ids.append(ds_id)  # cascade-cleans stage/file/task/events in teardown

        # accept the review → a structured event is persisted (decision + reviewer + stage + note)
        await review_task(str(ds_id), str(task_id), ReviewIn(decision="accept", note="Đạt"), owner_tok)

        async with get_session() as session:
            evs = (
                await session.execute(
                    select(ImagehubTaskReviewEvent).where(ImagehubTaskReviewEvent.task_id == task_id)
                )
            ).scalars().all()
        self.assertEqual(len(evs), 1)
        self.assertEqual(evs[0].decision, "accept")
        self.assertEqual(evs[0].reviewer_user_id, owner)
        self.assertEqual(evs[0].stage_id, stage_id)
        self.assertEqual(evs[0].note, "Đạt")

        # the stats endpoint tallies it for the reviewer (authorization is the LAST positional arg)
        stats = await review_stats(str(ds_id), str(owner), 30, owner_tok)
        self.assertEqual(stats.accepted, 1)
        self.assertEqual(stats.rejected, 0)

        # a foreign reviewer has no tally
        empty = await review_stats(str(ds_id), str(uuid.uuid4()), 30, owner_tok)
        self.assertEqual(empty.accepted, 0)

    async def test_create_upload_dedup_version_authz_audit(self) -> None:
        """Walk the whole skeleton: create, deduplicated upload, browse, authz, version, audit."""
        from fastapi import HTTPException
        from sqlalchemy import func, select

        from src.imagehub_routes import (
            DatasetCreateIn,
            VersionCreateIn,
            create_dataset,
            create_version,
            get_dataset,
            list_audit,
            list_datasets,
            list_files,
            list_versions,
            upload_files,
        )
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubBlob, ImagehubDatasetFile

        owner = await self._seed_user()
        admin = await self._seed_user(admin=True)
        other = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])
        admin_tok = _bearer(admin, ["admin"])
        other_tok = _bearer(other, ["viewer"])

        # create
        ds = await create_dataset(
            DatasetCreateIn(name="CT Ngực thử nghiệm", description="demo", modalityTags=["CT"]),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.name, "CT Ngực thử nghiệm")
        self.assertEqual(ds.modalityTags, ["CT"])
        self.assertEqual(ds.fileCount, 0)

        # upload the SAME content under two names → content-addressed dedup
        blob_bytes = uuid.uuid4().bytes * 64  # unique per run
        res = await upload_files(
            ds.id, [_upload("scan_a.bin", blob_bytes), _upload("scan_b.bin", blob_bytes)], owner_tok
        )
        self.assertTrue(res["ok"])
        shas = {f["sha256"] for f in res["files"]}
        self.assertEqual(len(shas), 1, "same content must hash to one sha256")
        deduped_flags = sorted(f["deduped"] for f in res["files"])
        self.assertEqual(deduped_flags, [False, True], "first stores the blob, second dedups")

        # DB: exactly one blob row for that sha256, two file rows for the dataset
        sha = next(iter(shas))
        async with get_session() as session:
            blob_count = (
                await session.execute(
                    select(func.count()).select_from(ImagehubBlob).where(ImagehubBlob.sha256 == sha)
                )
            ).scalar_one()
            file_count = (
                await session.execute(
                    select(func.count())
                    .select_from(ImagehubDatasetFile)
                    .where(ImagehubDatasetFile.dataset_id == uuid.UUID(ds.id))
                )
            ).scalar_one()
        self.assertEqual(blob_count, 1)
        self.assertEqual(file_count, 2)

        # browse files (each carries a presigned download URL)
        files = await list_files(ds.id, owner_tok)
        self.assertEqual(len(files), 2)
        self.assertTrue(all(f.downloadUrl for f in files))

        # authz: a non-admin other user can't see or read it
        owner_list = await list_datasets("mine", owner_tok)
        self.assertIn(ds.id, [d.id for d in owner_list])
        other_list = await list_datasets("all", other_tok)  # non-admin: scope=all ignored
        self.assertNotIn(ds.id, [d.id for d in other_list])
        with self.assertRaises(HTTPException) as ctx:
            await get_dataset(ds.id, other_tok)
        self.assertEqual(ctx.exception.status_code, 404)

        # admin sees every dataset (the clinical data repository)
        admin_list = await list_datasets("all", admin_tok)
        self.assertIn(ds.id, [d.id for d in admin_list])

        # version snapshot freezes the 2-file manifest
        ver = await create_version(ds.id, VersionCreateIn(message="phiên bản đầu"), owner_tok)
        self.assertEqual(ver.seq, 1)
        self.assertEqual(ver.fileCount, 2)
        versions = await list_versions(ds.id, owner_tok)
        self.assertEqual(len(versions), 1)

        # audit trail recorded each mutation
        audit = await list_audit(ds.id, owner_tok)
        actions = [a.action for a in audit]
        self.assertIn("Tạo bộ dữ liệu", actions)
        self.assertIn("Tải tệp lên", actions)
        self.assertIn("Tạo phiên bản", actions)


if __name__ == "__main__":
    unittest.main()