sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,407 @@
|
||||
"""Tests for the ImageHub dataset routes (milestone 1 walking skeleton).
|
||||
|
||||
Pure-helper unit tests always run. The full integration test (create dataset → upload with
|
||||
content-addressed dedup → version snapshot → owner/admin authz → audit) runs only when BOTH:
|
||||
- INITIATIVE_DATABASE_URL points at PostgreSQL (asyncpg), and
|
||||
- S3_ENDPOINT_URL is set (a reachable MinIO; the dev stack maps it to http://localhost:19000).
|
||||
|
||||
export INITIATIVE_DATABASE_URL="postgresql+asyncpg://initiative:initiative_secret@127.0.0.1:15432/initiatives"
|
||||
export S3_ENDPOINT_URL="http://localhost:19000" S3_ACCESS_KEY=minio_user S3_SECRET_KEY=minio_password \\
|
||||
S3_BUCKET_ATTACHMENTS=initiative-attachments S3_BUCKET_EXPORTS=initiative-exports \\
|
||||
S3_BUCKET_QUARANTINE=initiative-quarantine S3_PUBLIC_ENDPOINT_URL=http://localhost:19000
|
||||
cd be0 && python -m unittest tests.test_imagehub_datasets -v
|
||||
|
||||
Prereq for the integration test: migration 017_imagehub_datasets.sql applied (compose init mount
|
||||
or scripts/apply_initiative_migrations.py).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import os
|
||||
import unittest
|
||||
import uuid
|
||||
|
||||
_RUN_DB = os.getenv("INITIATIVE_DATABASE_URL", "").strip().lower().startswith("postgresql")
|
||||
_RUN_S3 = bool(os.getenv("S3_ENDPOINT_URL", "").strip())
|
||||
|
||||
# Let the module (which imports src.minio.storage → S3Settings()) import even when not running
|
||||
# against a real MinIO, so the pure-unit tests below can always run. These defaults match the
|
||||
# dev stack's host-mapped MinIO; the integration test only fires when S3_ENDPOINT_URL was set.
|
||||
os.environ.setdefault("S3_ENDPOINT_URL", "http://localhost:19000")
|
||||
os.environ.setdefault("S3_ACCESS_KEY", "minio_user")
|
||||
os.environ.setdefault("S3_SECRET_KEY", "minio_password")
|
||||
os.environ.setdefault("S3_BUCKET_ATTACHMENTS", "initiative-attachments")
|
||||
os.environ.setdefault("S3_BUCKET_EXPORTS", "initiative-exports")
|
||||
os.environ.setdefault("S3_BUCKET_QUARANTINE", "initiative-quarantine")
|
||||
os.environ.setdefault("S3_PUBLIC_ENDPOINT_URL", "http://localhost:19000")
|
||||
|
||||
|
||||
class PureHelperTests(unittest.TestCase):
|
||||
"""No DB / no network — string + sniff helpers."""
|
||||
|
||||
def test_build_blob_key_is_content_addressed(self) -> None:
|
||||
from src.minio.storage import S3Storage
|
||||
|
||||
key = S3Storage.build_blob_key("AbCdEf0123456789")
|
||||
self.assertEqual(key, "blobs/ab/cd/abcdef0123456789")
|
||||
|
||||
def test_slugify_strips_diacritics_and_punct(self) -> None:
|
||||
from src.imagehub_routes import _slugify
|
||||
|
||||
self.assertEqual(_slugify("Bộ dữ liệu CT Ngực!! 2026"), "bo-du-lieu-ct-nguc-2026")
|
||||
self.assertEqual(_slugify(""), "dataset")
|
||||
|
||||
def test_safe_logical_path_basename_only(self) -> None:
|
||||
from src.imagehub_routes import _safe_logical_path
|
||||
|
||||
self.assertEqual(_safe_logical_path("/evil/../a b.dcm"), "a_b.dcm")
|
||||
self.assertEqual(_safe_logical_path("C:\\scans\\series1.nii.gz"), "series1.nii.gz")
|
||||
self.assertEqual(_safe_logical_path(""), "file")
|
||||
|
||||
def test_safe_folder_path_preserves_dirs_rejects_traversal(self) -> None:
|
||||
from src.imagehub_routes import _safe_folder_path
|
||||
|
||||
# the directory is kept (basename dropped) so an uploaded tree round-trips
|
||||
self.assertEqual(_safe_folder_path("imagesTr/ct_001.nii.gz"), "imagesTr")
|
||||
self.assertEqual(_safe_folder_path("a/b/c/scan.nii.gz"), "a/b/c")
|
||||
# no directory component → dataset root
|
||||
self.assertEqual(_safe_folder_path("readme.txt"), "")
|
||||
self.assertEqual(_safe_folder_path(""), "")
|
||||
# leading slash + ".." traversal segments are stripped
|
||||
self.assertEqual(_safe_folder_path("/evil/../x/y.dcm"), "evil/x")
|
||||
# backslashes normalise to forward slashes
|
||||
self.assertEqual(_safe_folder_path("labelsTr\\sub\\m.nii.gz"), "labelsTr/sub")
|
||||
|
||||
def test_coerce_tags(self) -> None:
|
||||
from src.imagehub_routes import _coerce_tags
|
||||
|
||||
self.assertEqual(_coerce_tags(["CT", " MRI ", "", 7]), ["CT", "MRI", "7"])
|
||||
self.assertEqual(_coerce_tags("nope"), [])
|
||||
|
||||
def test_coerce_label_map(self) -> None:
|
||||
from src.imagehub_routes import _coerce_label_map
|
||||
|
||||
# valid entries kept + trimmed; non-positive / non-int keys and empty/non-str values dropped
|
||||
self.assertEqual(
|
||||
_coerce_label_map(
|
||||
{"1": " kidney ", "2": "tumor", "0": "bad", "-3": "bad", "x": "bad", "4": "", "+5": "bad", "1_0": "bad"}
|
||||
),
|
||||
{"1": "kidney", "2": "tumor"},
|
||||
)
|
||||
# integer keys coerce to strings; non-dict input → {}
|
||||
self.assertEqual(_coerce_label_map({1: "kidney"}), {"1": "kidney"})
|
||||
self.assertEqual(_coerce_label_map("nope"), {})
|
||||
self.assertEqual(_coerce_label_map(None), {})
|
||||
|
||||
def test_sniff_never_raises_on_non_imaging(self) -> None:
|
||||
from src.imagehub_routes import _sniff_imaging_meta
|
||||
|
||||
# plain bytes → {}; a .dcm name with junk must degrade to {} (never raise)
|
||||
self.assertEqual(_sniff_imaging_meta("notes.txt", b"hello world", "text/plain"), {})
|
||||
self.assertIsInstance(_sniff_imaging_meta("x.dcm", b"DICM" + b"\x00" * 200, "application/dicom"), dict)
|
||||
|
||||
|
||||
def _bearer(uid: uuid.UUID, roles: list[str]) -> str:
|
||||
import jwt
|
||||
|
||||
from src.auth_jwt import jwt_secret
|
||||
|
||||
return "Bearer " + jwt.encode({"sub": str(uid), "roles": roles, "cv": 0}, jwt_secret(), algorithm="HS256")
|
||||
|
||||
|
||||
def _upload(name: str, data: bytes, ctype: str = "application/octet-stream"):
|
||||
from starlette.datastructures import Headers, UploadFile
|
||||
|
||||
return UploadFile(io.BytesIO(data), filename=name, headers=Headers({"content-type": ctype}))
|
||||
|
||||
|
||||
@unittest.skipUnless(
|
||||
_RUN_DB and _RUN_S3,
|
||||
"Set INITIATIVE_DATABASE_URL=postgresql+asyncpg://… and S3_ENDPOINT_URL=… to run the integration test",
|
||||
)
|
||||
class ImagehubDatasetDbTests(unittest.IsolatedAsyncioTestCase):
|
||||
"""End-to-end: create → upload (content-addressed dedup) → version → owner/admin authz → audit."""
|
||||
|
||||
async def asyncSetUp(self) -> None:
|
||||
from src.initiative_db import engine as eng
|
||||
from src.minio.storage import storage
|
||||
|
||||
await eng.dispose_engine()
|
||||
await eng.init_engine()
|
||||
try:
|
||||
await storage.ensure_buckets_exist()
|
||||
except Exception as exc: # MinIO not reachable → skip rather than error
|
||||
self.skipTest(f"MinIO not reachable: {exc}")
|
||||
self._user_ids: list[uuid.UUID] = []
|
||||
self._dataset_ids: list[uuid.UUID] = []
|
||||
|
||||
async def asyncTearDown(self) -> None:
|
||||
from sqlalchemy import delete
|
||||
|
||||
from src.initiative_db import engine as eng
|
||||
from src.initiative_db.engine import get_session
|
||||
from src.initiative_db.models import ImagehubDataset, User
|
||||
|
||||
async with get_session() as session:
|
||||
for did in self._dataset_ids:
|
||||
await session.execute(delete(ImagehubDataset).where(ImagehubDataset.id == did))
|
||||
for uid in self._user_ids:
|
||||
await session.execute(delete(User).where(User.id == uid))
|
||||
await session.commit()
|
||||
await eng.dispose_engine()
|
||||
|
||||
async def _seed_user(self, *, admin: bool = False) -> uuid.UUID:
|
||||
from src.initiative_db.engine import get_session
|
||||
from src.initiative_db.models import User
|
||||
|
||||
uid = uuid.uuid4()
|
||||
async with get_session() as session:
|
||||
session.add(
|
||||
User(
|
||||
id=uid,
|
||||
email=f"ih-{uid.hex[:10]}@ump.edu.vn",
|
||||
password_hash="x",
|
||||
full_name=("Quản trị" if admin else "Nhà nghiên cứu") + " Test",
|
||||
)
|
||||
)
|
||||
await session.commit()
|
||||
self._user_ids.append(uid)
|
||||
return uid
|
||||
|
||||
async def test_dataset_research_project_link(self) -> None:
|
||||
"""A dataset can be created linked to a research project ("workspace"); the list can be
|
||||
filtered to that project; bad/foreign project ids are rejected (migration 024)."""
|
||||
from fastapi import HTTPException
|
||||
|
||||
from src.imagehub_routes import DatasetCreateIn, create_dataset, list_datasets
|
||||
from src.initiative_db.engine import get_session
|
||||
from src.initiative_db.models import ResearchProject
|
||||
|
||||
owner = await self._seed_user()
|
||||
owner_tok = _bearer(owner, ["viewer"])
|
||||
|
||||
# seed a research project ("workspace") owned by the user (cascade-cleaned with the user)
|
||||
proj_id = uuid.uuid4()
|
||||
async with get_session() as session:
|
||||
session.add(ResearchProject(id=proj_id, owner_user_id=owner, title="Đề tài thử nghiệm"))
|
||||
await session.commit()
|
||||
|
||||
# create a dataset linked to the project → the link is persisted
|
||||
ds = await create_dataset(
|
||||
DatasetCreateIn(name="Bộ dữ liệu thuộc đề tài", researchProjectId=str(proj_id)),
|
||||
owner_tok,
|
||||
)
|
||||
self._dataset_ids.append(uuid.UUID(ds.id))
|
||||
self.assertEqual(ds.researchProjectId, str(proj_id))
|
||||
|
||||
# a standalone dataset (no project) is still allowed and stays unlinked
|
||||
ds2 = await create_dataset(DatasetCreateIn(name="Bộ dữ liệu độc lập"), owner_tok)
|
||||
self._dataset_ids.append(uuid.UUID(ds2.id))
|
||||
self.assertIsNone(ds2.researchProjectId)
|
||||
|
||||
# a non-existent project id is rejected (422)
|
||||
with self.assertRaises(HTTPException) as ctx:
|
||||
await create_dataset(
|
||||
DatasetCreateIn(name="x", researchProjectId=str(uuid.uuid4())), owner_tok
|
||||
)
|
||||
self.assertEqual(ctx.exception.status_code, 422)
|
||||
|
||||
# ?projectId= filters the list to that project only (3rd positional arg = projectId)
|
||||
in_proj = await list_datasets("mine", owner_tok, str(proj_id))
|
||||
ids_in_proj = [d.id for d in in_proj]
|
||||
self.assertIn(ds.id, ids_in_proj)
|
||||
self.assertNotIn(ds2.id, ids_in_proj)
|
||||
self.assertTrue(all(d.researchProjectId == str(proj_id) for d in in_proj))
|
||||
|
||||
async def test_update_label_map_sanitizes_and_persists(self) -> None:
|
||||
"""update_dataset accepts a per-value label map, sanitizes it, and round-trips it (migration 027)."""
|
||||
from src.imagehub_routes import (
|
||||
DatasetCreateIn,
|
||||
DatasetUpdateIn,
|
||||
create_dataset,
|
||||
get_dataset,
|
||||
update_dataset,
|
||||
)
|
||||
|
||||
owner = await self._seed_user()
|
||||
owner_tok = _bearer(owner, ["viewer"])
|
||||
|
||||
ds = await create_dataset(DatasetCreateIn(name="KiTS labels"), owner_tok)
|
||||
self._dataset_ids.append(uuid.UUID(ds.id))
|
||||
self.assertEqual(ds.labelMap, {}) # empty by default
|
||||
|
||||
# garbage keys/values are dropped; valid ones trimmed + kept
|
||||
updated = await update_dataset(
|
||||
ds.id,
|
||||
DatasetUpdateIn(labelMap={"1": "kidney", "2": "tumor", "3": "cyst", "0": "bad", "x": "bad"}),
|
||||
owner_tok,
|
||||
)
|
||||
self.assertEqual(updated.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})
|
||||
|
||||
# persisted: a fresh read returns the same map
|
||||
fresh = await get_dataset(ds.id, owner_tok)
|
||||
self.assertEqual(fresh.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})
|
||||
|
||||
async def test_review_persists_decision_and_stats(self) -> None:
|
||||
"""review_task writes a structured review event; review-stats tallies it per reviewer (025)."""
|
||||
from sqlalchemy import select
|
||||
|
||||
from src.imagehub_routes import ReviewIn, review_stats, review_task
|
||||
from src.initiative_db.engine import get_session
|
||||
from src.initiative_db.models import (
|
||||
ImagehubBlob,
|
||||
ImagehubDataset,
|
||||
ImagehubDatasetFile,
|
||||
ImagehubDatasetStage,
|
||||
ImagehubTask,
|
||||
ImagehubTaskReviewEvent,
|
||||
)
|
||||
|
||||
owner = await self._seed_user()
|
||||
owner_tok = _bearer(owner, ["viewer"])
|
||||
|
||||
# build the minimal chain (no upload): dataset + a Review stage + a file + a task already
|
||||
# advanced to that Review stage, assigned to the owner.
|
||||
ds_id, stage_id, file_id, task_id = (uuid.uuid4() for _ in range(4))
|
||||
sha = uuid.uuid4().hex
|
||||
async with get_session() as session:
|
||||
session.add(ImagehubDataset(id=ds_id, owner_user_id=owner, name="Review demo"))
|
||||
session.add(
|
||||
ImagehubDatasetStage(id=stage_id, dataset_id=ds_id, name="Rà soát 1", kind="review", seq=1)
|
||||
)
|
||||
session.add(ImagehubBlob(sha256=sha, size_bytes=1))
|
||||
session.add(
|
||||
ImagehubDatasetFile(id=file_id, dataset_id=ds_id, logical_path="ct.nii.gz", blob_sha256=sha)
|
||||
)
|
||||
session.add(
|
||||
ImagehubTask(
|
||||
id=task_id, dataset_id=ds_id, dataset_file_id=file_id, name="ct.nii.gz",
|
||||
current_stage_id=stage_id, pipeline_state="inReview", queue_status="assigned",
|
||||
assignee_user_id=owner,
|
||||
)
|
||||
)
|
||||
await session.commit()
|
||||
self._dataset_ids.append(ds_id) # cascade-cleans stage/file/task/events in teardown
|
||||
|
||||
# accept the review → a structured event is persisted (decision + reviewer + stage + note)
|
||||
await review_task(str(ds_id), str(task_id), ReviewIn(decision="accept", note="Đạt"), owner_tok)
|
||||
async with get_session() as session:
|
||||
evs = (
|
||||
await session.execute(
|
||||
select(ImagehubTaskReviewEvent).where(ImagehubTaskReviewEvent.task_id == task_id)
|
||||
)
|
||||
).scalars().all()
|
||||
self.assertEqual(len(evs), 1)
|
||||
self.assertEqual(evs[0].decision, "accept")
|
||||
self.assertEqual(evs[0].reviewer_user_id, owner)
|
||||
self.assertEqual(evs[0].stage_id, stage_id)
|
||||
self.assertEqual(evs[0].note, "Đạt")
|
||||
|
||||
# the stats endpoint tallies it for the reviewer (authorization is the LAST positional arg)
|
||||
stats = await review_stats(str(ds_id), str(owner), 30, owner_tok)
|
||||
self.assertEqual(stats.accepted, 1)
|
||||
self.assertEqual(stats.rejected, 0)
|
||||
# a foreign reviewer has no tally
|
||||
empty = await review_stats(str(ds_id), str(uuid.uuid4()), 30, owner_tok)
|
||||
self.assertEqual(empty.accepted, 0)
|
||||
|
||||
async def test_create_upload_dedup_version_authz_audit(self) -> None:
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import func, select
|
||||
|
||||
from src.imagehub_routes import (
|
||||
DatasetCreateIn,
|
||||
VersionCreateIn,
|
||||
create_dataset,
|
||||
create_version,
|
||||
get_dataset,
|
||||
list_audit,
|
||||
list_datasets,
|
||||
list_files,
|
||||
list_versions,
|
||||
upload_files,
|
||||
)
|
||||
from src.initiative_db.engine import get_session
|
||||
from src.initiative_db.models import ImagehubBlob, ImagehubDatasetFile
|
||||
|
||||
owner = await self._seed_user()
|
||||
admin = await self._seed_user(admin=True)
|
||||
other = await self._seed_user()
|
||||
owner_tok = _bearer(owner, ["viewer"])
|
||||
admin_tok = _bearer(admin, ["admin"])
|
||||
other_tok = _bearer(other, ["viewer"])
|
||||
|
||||
# create
|
||||
ds = await create_dataset(
|
||||
DatasetCreateIn(name="CT Ngực thử nghiệm", description="demo", modalityTags=["CT"]),
|
||||
owner_tok,
|
||||
)
|
||||
self._dataset_ids.append(uuid.UUID(ds.id))
|
||||
self.assertEqual(ds.name, "CT Ngực thử nghiệm")
|
||||
self.assertEqual(ds.modalityTags, ["CT"])
|
||||
self.assertEqual(ds.fileCount, 0)
|
||||
|
||||
# upload the SAME content under two names → content-addressed dedup
|
||||
blob_bytes = uuid.uuid4().bytes * 64 # unique per run
|
||||
res = await upload_files(
|
||||
ds.id, [_upload("scan_a.bin", blob_bytes), _upload("scan_b.bin", blob_bytes)], owner_tok
|
||||
)
|
||||
self.assertTrue(res["ok"])
|
||||
shas = {f["sha256"] for f in res["files"]}
|
||||
self.assertEqual(len(shas), 1, "same content must hash to one sha256")
|
||||
deduped_flags = sorted(f["deduped"] for f in res["files"])
|
||||
self.assertEqual(deduped_flags, [False, True], "first stores the blob, second dedups")
|
||||
|
||||
# DB: exactly one blob row for that sha256, two file rows for the dataset
|
||||
sha = next(iter(shas))
|
||||
async with get_session() as session:
|
||||
blob_count = (
|
||||
await session.execute(
|
||||
select(func.count()).select_from(ImagehubBlob).where(ImagehubBlob.sha256 == sha)
|
||||
)
|
||||
).scalar_one()
|
||||
file_count = (
|
||||
await session.execute(
|
||||
select(func.count())
|
||||
.select_from(ImagehubDatasetFile)
|
||||
.where(ImagehubDatasetFile.dataset_id == uuid.UUID(ds.id))
|
||||
)
|
||||
).scalar_one()
|
||||
self.assertEqual(blob_count, 1)
|
||||
self.assertEqual(file_count, 2)
|
||||
|
||||
# browse files (each carries a presigned download URL)
|
||||
files = await list_files(ds.id, owner_tok)
|
||||
self.assertEqual(len(files), 2)
|
||||
self.assertTrue(all(f.downloadUrl for f in files))
|
||||
|
||||
# authz: a non-admin other user can't see or read it
|
||||
owner_list = await list_datasets("mine", owner_tok)
|
||||
self.assertIn(ds.id, [d.id for d in owner_list])
|
||||
other_list = await list_datasets("all", other_tok) # non-admin: scope=all ignored
|
||||
self.assertNotIn(ds.id, [d.id for d in other_list])
|
||||
with self.assertRaises(HTTPException) as ctx:
|
||||
await get_dataset(ds.id, other_tok)
|
||||
self.assertEqual(ctx.exception.status_code, 404)
|
||||
|
||||
# admin sees every dataset (the clinical data repository)
|
||||
admin_list = await list_datasets("all", admin_tok)
|
||||
self.assertIn(ds.id, [d.id for d in admin_list])
|
||||
|
||||
# version snapshot freezes the 2-file manifest
|
||||
ver = await create_version(ds.id, VersionCreateIn(message="phiên bản đầu"), owner_tok)
|
||||
self.assertEqual(ver.seq, 1)
|
||||
self.assertEqual(ver.fileCount, 2)
|
||||
versions = await list_versions(ds.id, owner_tok)
|
||||
self.assertEqual(len(versions), 1)
|
||||
|
||||
# audit trail recorded each mutation
|
||||
audit = await list_audit(ds.id, owner_tok)
|
||||
actions = [a.action for a in audit]
|
||||
self.assertIn("Tạo bộ dữ liệu", actions)
|
||||
self.assertIn("Tải tệp lên", actions)
|
||||
self.assertIn("Tạo phiên bản", actions)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
Reference in New Issue
Block a user