sciagent code + Gitea Actions CI/CD
CI/CD / backend (push) Failing after 2m8s
CI/CD / frontend (push) Failing after 1m40s
CI/CD / deploy (push) Has been skipped

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Thinh Lam
2026-06-30 09:38:30 +07:00
commit 688fac73e9
1167 changed files with 158244 additions and 0 deletions
+407
View File
@@ -0,0 +1,407 @@
"""Tests for the ImageHub dataset routes (milestone 1 walking skeleton).
Pure-helper unit tests always run. The full integration test (create dataset → upload with
content-addressed dedup → version snapshot → owner/admin authz → audit) runs only when BOTH:
- INITIATIVE_DATABASE_URL points at PostgreSQL (asyncpg), and
- S3_ENDPOINT_URL is set (a reachable MinIO; the dev stack maps it to http://localhost:19000).
export INITIATIVE_DATABASE_URL="postgresql+asyncpg://initiative:initiative_secret@127.0.0.1:15432/initiatives"
export S3_ENDPOINT_URL="http://localhost:19000" S3_ACCESS_KEY=minio_user S3_SECRET_KEY=minio_password \\
S3_BUCKET_ATTACHMENTS=initiative-attachments S3_BUCKET_EXPORTS=initiative-exports \\
S3_BUCKET_QUARANTINE=initiative-quarantine S3_PUBLIC_ENDPOINT_URL=http://localhost:19000
cd be0 && python -m unittest tests.test_imagehub_datasets -v
Prereq for the integration tests: migration 017_imagehub_datasets.sql and the later ImageHub
migrations the tests below rely on (024, 025, 027) applied (compose init mount or
scripts/apply_initiative_migrations.py).
"""
from __future__ import annotations
import io
import os
import unittest
import uuid
# Integration gates, captured BEFORE any defaults are injected below so that they reflect what
# the caller actually exported rather than the fallbacks.
_RUN_DB = os.environ.get("INITIATIVE_DATABASE_URL", "").strip().lower().startswith("postgresql")
_RUN_S3 = os.environ.get("S3_ENDPOINT_URL", "").strip() != ""

# Importing this module pulls in src.minio.storage → S3Settings(), so the S3 variables need a
# value even when no real MinIO is in use; otherwise the pure-unit tests could not be imported.
# The fallbacks mirror the dev stack's host-mapped MinIO. The integration tests still only run
# when S3_ENDPOINT_URL was set by the caller (see _RUN_S3 above).
for _name, _fallback in (
    ("S3_ENDPOINT_URL", "http://localhost:19000"),
    ("S3_ACCESS_KEY", "minio_user"),
    ("S3_SECRET_KEY", "minio_password"),
    ("S3_BUCKET_ATTACHMENTS", "initiative-attachments"),
    ("S3_BUCKET_EXPORTS", "initiative-exports"),
    ("S3_BUCKET_QUARANTINE", "initiative-quarantine"),
    ("S3_PUBLIC_ENDPOINT_URL", "http://localhost:19000"),
):
    os.environ.setdefault(_name, _fallback)
del _name, _fallback  # keep the loop variables out of the module namespace
class PureHelperTests(unittest.TestCase):
    """Unit tests for the string and sniffing helpers; no database and no network involved."""

    def test_build_blob_key_is_content_addressed(self) -> None:
        """The blob key is derived from the digest alone (expected value shows a lower-cased, sharded path)."""
        from src.minio.storage import S3Storage

        expected = "blobs/ab/cd/abcdef0123456789"
        self.assertEqual(S3Storage.build_blob_key("AbCdEf0123456789"), expected)

    def test_slugify_strips_diacritics_and_punct(self) -> None:
        """Vietnamese diacritics and punctuation are removed; an empty name falls back to "dataset"."""
        from src.imagehub_routes import _slugify

        cases = (
            ("Bộ dữ liệu CT Ngực!! 2026", "bo-du-lieu-ct-nguc-2026"),
            ("", "dataset"),
        )
        for raw, slug in cases:
            self.assertEqual(_slugify(raw), slug)

    def test_safe_logical_path_basename_only(self) -> None:
        """Only a sanitised basename survives, whatever directory prefix the client sent."""
        from src.imagehub_routes import _safe_logical_path

        cases = (
            ("/evil/../a b.dcm", "a_b.dcm"),
            ("C:\\scans\\series1.nii.gz", "series1.nii.gz"),
            ("", "file"),
        )
        for raw, cleaned in cases:
            self.assertEqual(_safe_logical_path(raw), cleaned)

    def test_safe_folder_path_preserves_dirs_rejects_traversal(self) -> None:
        """The directory part is kept (so an uploaded tree round-trips) while traversal is stripped."""
        from src.imagehub_routes import _safe_folder_path

        cases = (
            # the directory is kept, the basename is dropped
            ("imagesTr/ct_001.nii.gz", "imagesTr"),
            ("a/b/c/scan.nii.gz", "a/b/c"),
            # no directory component → dataset root
            ("readme.txt", ""),
            ("", ""),
            # leading slash and ".." segments are removed
            ("/evil/../x/y.dcm", "evil/x"),
            # backslashes are normalised to forward slashes
            ("labelsTr\\sub\\m.nii.gz", "labelsTr/sub"),
        )
        for raw, folder in cases:
            self.assertEqual(_safe_folder_path(raw), folder)

    def test_coerce_tags(self) -> None:
        """Tags are trimmed, blanks dropped, non-strings stringified; a non-list input yields []."""
        from src.imagehub_routes import _coerce_tags

        mixed = ["CT", " MRI ", "", 7]
        self.assertEqual(_coerce_tags(mixed), ["CT", "MRI", "7"])
        self.assertEqual(_coerce_tags("nope"), [])

    def test_coerce_label_map(self) -> None:
        """Only positive-integer keys with non-empty string values survive label-map coercion."""
        from src.imagehub_routes import _coerce_label_map

        # valid entries are kept and trimmed; non-positive / non-int keys and empty values are dropped
        noisy = {
            "1": " kidney ",
            "2": "tumor",
            "0": "bad",
            "-3": "bad",
            "x": "bad",
            "4": "",
            "+5": "bad",
            "1_0": "bad",
        }
        self.assertEqual(_coerce_label_map(noisy), {"1": "kidney", "2": "tumor"})

        # integer keys are coerced to strings
        self.assertEqual(_coerce_label_map({1: "kidney"}), {"1": "kidney"})

        # anything that is not a dict collapses to an empty map
        for not_a_dict in ("nope", None):
            self.assertEqual(_coerce_label_map(not_a_dict), {})

    def test_sniff_never_raises_on_non_imaging(self) -> None:
        """Sniffing non-imaging or junk payloads must return a dict instead of raising."""
        from src.imagehub_routes import _sniff_imaging_meta

        # plain bytes → {}
        self.assertEqual(_sniff_imaging_meta("notes.txt", b"hello world", "text/plain"), {})

        # a .dcm name carrying junk must still come back as a dict (never raise)
        junk_dicom = b"DICM" + b"\x00" * 200
        self.assertIsInstance(_sniff_imaging_meta("x.dcm", junk_dicom, "application/dicom"), dict)
def _bearer(uid: uuid.UUID, roles: list[str]) -> str:
    """Return an ``Authorization`` header value holding an HS256 JWT for *uid* with *roles*.

    The token is signed with the application's own secret (``jwt_secret()``) and carries the
    claims ``sub``, ``roles`` and ``cv`` (fixed at 0).
    """
    import jwt

    from src.auth_jwt import jwt_secret

    claims = {"sub": str(uid), "roles": roles, "cv": 0}
    token = jwt.encode(claims, jwt_secret(), algorithm="HS256")
    return "Bearer " + token
def _upload(name: str, data: bytes, ctype: str = "application/octet-stream"):
    """Wrap *data* in an in-memory Starlette ``UploadFile`` named *name* with content type *ctype*."""
    from starlette.datastructures import Headers, UploadFile

    stream = io.BytesIO(data)
    headers = Headers({"content-type": ctype})
    return UploadFile(stream, filename=name, headers=headers)
@unittest.skipUnless(
    _RUN_DB and _RUN_S3,
    "Set INITIATIVE_DATABASE_URL=postgresql+asyncpg://… and S3_ENDPOINT_URL=… to run the integration test",
)
class ImagehubDatasetDbTests(unittest.IsolatedAsyncioTestCase):
    """End-to-end: create → upload (content-addressed dedup) → version → owner/admin authz → audit.

    The route functions are called directly with a bearer token built by ``_bearer``. Every user
    and dataset a test creates is recorded in ``self._user_ids`` / ``self._dataset_ids`` and
    deleted again in ``asyncTearDown``.
    """

    async def asyncSetUp(self) -> None:
        """Re-initialise the DB engine and make sure the S3 buckets exist; skip if MinIO is down."""
        from src.initiative_db import engine as eng
        from src.minio.storage import storage

        # Create the tracking lists first so they exist no matter where setup stops.
        self._user_ids: list[uuid.UUID] = []
        self._dataset_ids: list[uuid.UUID] = []

        await eng.dispose_engine()
        await eng.init_engine()
        try:
            await storage.ensure_buckets_exist()
        except Exception as exc:  # MinIO not reachable → skip rather than error
            # unittest does not run tearDown when setUp raises (SkipTest included), so the
            # engine initialised above has to be released here or it would stay open.
            await eng.dispose_engine()
            self.skipTest(f"MinIO not reachable: {exc}")

    async def asyncTearDown(self) -> None:
        """Delete the datasets and users this test registered, then dispose the engine."""
        from sqlalchemy import delete

        from src.initiative_db import engine as eng
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubDataset, User

        try:
            async with get_session() as session:
                # datasets first, then the users that own them
                for did in self._dataset_ids:
                    await session.execute(delete(ImagehubDataset).where(ImagehubDataset.id == did))
                for uid in self._user_ids:
                    await session.execute(delete(User).where(User.id == uid))
                await session.commit()
        finally:
            # Always release the engine, even when a cleanup statement fails.
            await eng.dispose_engine()

    async def _seed_user(self, *, admin: bool = False) -> uuid.UUID:
        """Insert a throw-away user row and return its id.

        ``admin`` only changes the display name stored on the row; the roles a test acts with
        come from the token built by ``_bearer``.
        """
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import User

        uid = uuid.uuid4()
        async with get_session() as session:
            session.add(
                User(
                    id=uid,
                    email=f"ih-{uid.hex[:10]}@ump.edu.vn",
                    password_hash="x",
                    full_name=("Quản trị" if admin else "Nhà nghiên cứu") + " Test",
                )
            )
            await session.commit()
        # registered only after a successful commit, so teardown deletes rows that really exist
        self._user_ids.append(uid)
        return uid

    async def test_dataset_research_project_link(self) -> None:
        """A dataset can be created linked to a research project ("workspace"); the list can be
        filtered to that project; bad/foreign project ids are rejected (migration 024)."""
        from fastapi import HTTPException

        from src.imagehub_routes import DatasetCreateIn, create_dataset, list_datasets
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ResearchProject

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # seed a research project ("workspace") owned by the user (cascade-cleaned with the user)
        proj_id = uuid.uuid4()
        async with get_session() as session:
            session.add(ResearchProject(id=proj_id, owner_user_id=owner, title="Đề tài thử nghiệm"))
            await session.commit()

        # create a dataset linked to the project → the link is persisted
        ds = await create_dataset(
            DatasetCreateIn(name="Bộ dữ liệu thuộc đề tài", researchProjectId=str(proj_id)),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.researchProjectId, str(proj_id))

        # a standalone dataset (no project) is still allowed and stays unlinked
        ds2 = await create_dataset(DatasetCreateIn(name="Bộ dữ liệu độc lập"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds2.id))
        self.assertIsNone(ds2.researchProjectId)

        # a non-existent project id is rejected (422)
        with self.assertRaises(HTTPException) as ctx:
            await create_dataset(
                DatasetCreateIn(name="x", researchProjectId=str(uuid.uuid4())), owner_tok
            )
        self.assertEqual(ctx.exception.status_code, 422)

        # ?projectId= filters the list to that project only (3rd positional arg = projectId)
        in_proj = await list_datasets("mine", owner_tok, str(proj_id))
        ids_in_proj = [d.id for d in in_proj]
        self.assertIn(ds.id, ids_in_proj)
        self.assertNotIn(ds2.id, ids_in_proj)
        self.assertTrue(all(d.researchProjectId == str(proj_id) for d in in_proj))

    async def test_update_label_map_sanitizes_and_persists(self) -> None:
        """update_dataset accepts a per-value label map, sanitizes it, and round-trips it (migration 027)."""
        from src.imagehub_routes import (
            DatasetCreateIn,
            DatasetUpdateIn,
            create_dataset,
            get_dataset,
            update_dataset,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])
        ds = await create_dataset(DatasetCreateIn(name="KiTS labels"), owner_tok)
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.labelMap, {})  # empty by default

        # garbage keys/values are dropped; valid ones trimmed + kept
        updated = await update_dataset(
            ds.id,
            DatasetUpdateIn(labelMap={"1": "kidney", "2": "tumor", "3": "cyst", "0": "bad", "x": "bad"}),
            owner_tok,
        )
        self.assertEqual(updated.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

        # persisted: a fresh read returns the same map
        fresh = await get_dataset(ds.id, owner_tok)
        self.assertEqual(fresh.labelMap, {"1": "kidney", "2": "tumor", "3": "cyst"})

    async def test_review_persists_decision_and_stats(self) -> None:
        """review_task writes a structured review event; review-stats tallies it per reviewer (025)."""
        from sqlalchemy import select

        from src.imagehub_routes import ReviewIn, review_stats, review_task
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import (
            ImagehubBlob,
            ImagehubDataset,
            ImagehubDatasetFile,
            ImagehubDatasetStage,
            ImagehubTask,
            ImagehubTaskReviewEvent,
        )

        owner = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])

        # build the minimal chain (no upload): dataset + a Review stage + a file + a task already
        # advanced to that Review stage, assigned to the owner.
        ds_id, stage_id, file_id, task_id = (uuid.uuid4() for _ in range(4))
        sha = uuid.uuid4().hex  # random stand-in for a content hash; unique per run
        async with get_session() as session:
            session.add(ImagehubDataset(id=ds_id, owner_user_id=owner, name="Review demo"))
            session.add(
                ImagehubDatasetStage(id=stage_id, dataset_id=ds_id, name="Rà soát 1", kind="review", seq=1)
            )
            session.add(ImagehubBlob(sha256=sha, size_bytes=1))
            session.add(
                ImagehubDatasetFile(id=file_id, dataset_id=ds_id, logical_path="ct.nii.gz", blob_sha256=sha)
            )
            session.add(
                ImagehubTask(
                    id=task_id, dataset_id=ds_id, dataset_file_id=file_id, name="ct.nii.gz",
                    current_stage_id=stage_id, pipeline_state="inReview", queue_status="assigned",
                    assignee_user_id=owner,
                )
            )
            await session.commit()
        self._dataset_ids.append(ds_id)  # cascade-cleans stage/file/task/events in teardown

        # accept the review → a structured event is persisted (decision + reviewer + stage + note)
        await review_task(str(ds_id), str(task_id), ReviewIn(decision="accept", note="Đạt"), owner_tok)
        async with get_session() as session:
            evs = (
                await session.execute(
                    select(ImagehubTaskReviewEvent).where(ImagehubTaskReviewEvent.task_id == task_id)
                )
            ).scalars().all()
        self.assertEqual(len(evs), 1)
        self.assertEqual(evs[0].decision, "accept")
        self.assertEqual(evs[0].reviewer_user_id, owner)
        self.assertEqual(evs[0].stage_id, stage_id)
        self.assertEqual(evs[0].note, "Đạt")

        # the stats endpoint tallies it for the reviewer (authorization is the LAST positional arg)
        stats = await review_stats(str(ds_id), str(owner), 30, owner_tok)
        self.assertEqual(stats.accepted, 1)
        self.assertEqual(stats.rejected, 0)

        # a foreign reviewer has no tally
        empty = await review_stats(str(ds_id), str(uuid.uuid4()), 30, owner_tok)
        self.assertEqual(empty.accepted, 0)

    async def test_create_upload_dedup_version_authz_audit(self) -> None:
        """Walk the whole flow: create, upload twice with dedup, list, authz, version, audit."""
        from fastapi import HTTPException
        from sqlalchemy import func, select

        from src.imagehub_routes import (
            DatasetCreateIn,
            VersionCreateIn,
            create_dataset,
            create_version,
            get_dataset,
            list_audit,
            list_datasets,
            list_files,
            list_versions,
            upload_files,
        )
        from src.initiative_db.engine import get_session
        from src.initiative_db.models import ImagehubBlob, ImagehubDatasetFile

        owner = await self._seed_user()
        admin = await self._seed_user(admin=True)
        other = await self._seed_user()
        owner_tok = _bearer(owner, ["viewer"])
        admin_tok = _bearer(admin, ["admin"])
        other_tok = _bearer(other, ["viewer"])

        # create
        ds = await create_dataset(
            DatasetCreateIn(name="CT Ngực thử nghiệm", description="demo", modalityTags=["CT"]),
            owner_tok,
        )
        self._dataset_ids.append(uuid.UUID(ds.id))
        self.assertEqual(ds.name, "CT Ngực thử nghiệm")
        self.assertEqual(ds.modalityTags, ["CT"])
        self.assertEqual(ds.fileCount, 0)

        # upload the SAME content under two names → content-addressed dedup
        blob_bytes = uuid.uuid4().bytes * 64  # unique per run
        res = await upload_files(
            ds.id, [_upload("scan_a.bin", blob_bytes), _upload("scan_b.bin", blob_bytes)], owner_tok
        )
        self.assertTrue(res["ok"])
        shas = {f["sha256"] for f in res["files"]}
        self.assertEqual(len(shas), 1, "same content must hash to one sha256")
        deduped_flags = sorted(f["deduped"] for f in res["files"])
        self.assertEqual(deduped_flags, [False, True], "first stores the blob, second dedups")

        # DB: exactly one blob row for that sha256, two file rows for the dataset
        sha = next(iter(shas))
        async with get_session() as session:
            blob_count = (
                await session.execute(
                    select(func.count()).select_from(ImagehubBlob).where(ImagehubBlob.sha256 == sha)
                )
            ).scalar_one()
            file_count = (
                await session.execute(
                    select(func.count())
                    .select_from(ImagehubDatasetFile)
                    .where(ImagehubDatasetFile.dataset_id == uuid.UUID(ds.id))
                )
            ).scalar_one()
        self.assertEqual(blob_count, 1)
        self.assertEqual(file_count, 2)

        # browse files (each carries a presigned download URL)
        files = await list_files(ds.id, owner_tok)
        self.assertEqual(len(files), 2)
        self.assertTrue(all(f.downloadUrl for f in files))

        # authz: a non-admin other user can't see or read it
        owner_list = await list_datasets("mine", owner_tok)
        self.assertIn(ds.id, [d.id for d in owner_list])
        other_list = await list_datasets("all", other_tok)  # non-admin: scope=all ignored
        self.assertNotIn(ds.id, [d.id for d in other_list])
        with self.assertRaises(HTTPException) as ctx:
            await get_dataset(ds.id, other_tok)
        self.assertEqual(ctx.exception.status_code, 404)

        # admin sees every dataset (the clinical data repository)
        admin_list = await list_datasets("all", admin_tok)
        self.assertIn(ds.id, [d.id for d in admin_list])

        # version snapshot freezes the 2-file manifest
        ver = await create_version(ds.id, VersionCreateIn(message="phiên bản đầu"), owner_tok)
        self.assertEqual(ver.seq, 1)
        self.assertEqual(ver.fileCount, 2)
        versions = await list_versions(ds.id, owner_tok)
        self.assertEqual(len(versions), 1)

        # audit trail recorded each mutation
        audit = await list_audit(ds.id, owner_tok)
        actions = [a.action for a in audit]
        self.assertIn("Tạo bộ dữ liệu", actions)
        self.assertIn("Tải tệp lên", actions)
        self.assertIn("Tạo phiên bản", actions)
# Allow running this file directly as a script, in addition to `python -m unittest ...`.
if __name__ == "__main__":
    unittest.main()