From 27b4c24f181d8fa6f6198aecd5586e4059c1539e Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 17 Sep 2024 14:12:10 +1000 Subject: [PATCH 1/7] adds realistic test --- real-tests/usyd_transfer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/real-tests/usyd_transfer.py b/real-tests/usyd_transfer.py index 5afe624..d143b51 100644 --- a/real-tests/usyd_transfer.py +++ b/real-tests/usyd_transfer.py @@ -1,4 +1,3 @@ -import os from click.testing import CliRunner from xnat_ingest.cli import transfer from xnat_ingest.utils import show_cli_trace From 53a0c13b60295dd288304103eae8d7a78649ad42 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 17 Sep 2024 17:30:52 +1000 Subject: [PATCH 2/7] made ImagingSession file-format agnostic --- xnat_ingest/cli/stage.py | 90 +++++++++-- xnat_ingest/dicom.py | 37 ++--- xnat_ingest/exceptions.py | 4 +- xnat_ingest/session.py | 313 ++++++++++++++++++++++++-------------- 4 files changed, 285 insertions(+), 159 deletions(-) diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index c6ac8ae..d184401 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -6,7 +6,7 @@ from tqdm import tqdm from xnat_ingest.cli.base import cli from xnat_ingest.session import ImagingSession -from arcana.xnat import Xnat +from frametree.xnat import Xnat from xnat_ingest.utils import ( DicomField, AssociatedFiles, @@ -30,31 +30,76 @@ are uploaded to XNAT """, ) -@click.argument("dicoms_path", type=str, envvar="XNAT_INGEST_STAGE_DICOMS_PATH") +@click.argument("files_path", type=str, envvar="XNAT_INGEST_STAGE_DICOMS_PATH") @click.argument( "staging_dir", type=click.Path(path_type=Path), envvar="XNAT_INGEST_STAGE_DIR" ) +@click.option( + "--datatype", + type=str, + metavar="", + multiple=True, + default="medimage/dicom-series", + envvar="XNAT_INGEST_STAGE_DATATYPE", + help="The datatype of the primary files to to upload", +) @click.option( "--project-field", - type=DicomField, + type=str, default="StudyID", envvar="XNAT_INGEST_STAGE_PROJECT", - help=("The keyword or tag of the DICOM field to extract the XNAT project ID from "), + help=("The keyword of the metadata field to extract the XNAT project ID from "), ) @click.option( "--subject-field", - type=DicomField, + type=str, default="PatientID", envvar="XNAT_INGEST_STAGE_SUBJECT", - help=("The keyword or tag of the DICOM field to extract the XNAT subject ID from "), + help=("The keyword of the metadata field to extract the XNAT subject ID from "), ) @click.option( "--visit-field", - type=DicomField, + type=str, default="AccessionNumber", + envvar="XNAT_INGEST_STAGE_VISIT", + help=( + "The keyword of the metadata field to extract the XNAT imaging session ID from " + ), +) +@click.option( + "--session-field", + type=str, + default=None, envvar="XNAT_INGEST_STAGE_SESSION", help=( - "The keyword or tag of the DICOM field to extract the XNAT imaging session ID from " + "The keyword of the metadata field to extract the XNAT imaging session ID from " + ), +) +@click.option( + "--scan-id-field", + type=str, + default="SeriesNumber", + envvar="XNAT_INGEST_STAGE_SCAN_ID", + help=( + "The keyword of the metadata field to extract the XNAT imaging scan ID from " + ), +) +@click.option( + "--scan-desc-field", + type=str, + default="SeriesDescription", + envvar="XNAT_INGEST_STAGE_SCAN_DESC", + help=( + "The keyword of the metadata field to extract the XNAT imaging scan description from " + ), +) +@click.option( + "--resource-field", + type=str, + default="ImageType", + envvar="XNAT_INGEST_STAGE_RESOURCE", + help=( + "The keyword of the metadata field to extract the XNAT imaging resource ID from " ), ) @click.option( @@ -68,6 +113,7 @@ type=AssociatedFiles.cli_type, nargs=2, default=None, + multiple=True, envvar="XNAT_INGEST_STAGE_ASSOCIATED", metavar=" ", help=( @@ -181,12 +227,17 @@ type=bool, ) def stage( - dicoms_path: str, + files_path: str, staging_dir: Path, + datatype: str, associated_files: AssociatedFiles, - project_field: DicomField, - subject_field: DicomField, - visit_field: DicomField, + project_field: str, + subject_field: str, + visit_field: str, + session_field: str | None, + scan_id_field: str, + scan_desc_field: str, + resource_field: str, project_id: str | None, delete: bool, log_level: str, @@ -219,7 +270,10 @@ def stage( else: project_list = None - msg = f"Loading DICOM sessions from '{dicoms_path}'" + if session_field is None and datatype == "medimage/dicom-series": + session_field = "StudyInstanceUID" + + msg = f"Loading {datatype} sessions from '{files_path}'" if associated_files: msg += f" with associated files selected from '{associated_files.glob}'" @@ -228,17 +282,21 @@ def stage( logger.info(msg) - sessions = ImagingSession.from_dicoms( - dicoms_path=dicoms_path, + sessions = ImagingSession.from_paths( + files_path=files_path, project_field=project_field, subject_field=subject_field, visit_field=visit_field, + session_field=session_field, + scan_id_field=scan_id_field, + scan_desc_field=scan_desc_field, + resource_field=resource_field, project_id=project_id, ) logger.info("Staging sessions to '%s'", str(staging_dir)) - for session in tqdm(sessions, f"Staging DICOM sessions found in '{dicoms_path}'"): + for session in tqdm(sessions, f"Staging DICOM sessions found in '{files_path}'"): try: session_staging_dir = staging_dir.joinpath(*session.staging_relpath) if session_staging_dir.exists(): diff --git a/xnat_ingest/dicom.py b/xnat_ingest/dicom.py index 8ae6672..2708b38 100644 --- a/xnat_ingest/dicom.py +++ b/xnat_ingest/dicom.py @@ -1,18 +1,20 @@ import typing as ty import subprocess as sp + # import re import pydicom + # from fileformats.core import FileSet # from fileformats.application import Dicom # from fileformats.extras.application.medical import dicom_read_metadata - +dcmedit_path: ty.Optional[str] try: dcmedit_path = sp.check_output("which dcmedit", shell=True).decode("utf-8").strip() except sp.CalledProcessError: dcmedit_path = None - +dcminfo_path: ty.Optional[str] try: dcminfo_path = sp.check_output("which dcminfo", shell=True).decode("utf-8").strip() except sp.CalledProcessError: @@ -20,11 +22,14 @@ def tag2keyword(tag: ty.Tuple[str, str]) -> str: - return pydicom.datadict.dictionary_keyword(tag) + return pydicom.datadict.dictionary_keyword((int(tag[0]), int(tag[1]))) def keyword2tag(keyword: str) -> ty.Tuple[str, str]: - tag_str = hex(pydicom.datadict.tag_for_keyword(keyword))[2:] + tag = pydicom.datadict.tag_for_keyword(keyword) + if not tag: + raise ValueError(f"Could not find tag for keyword '{keyword}'") + tag_str = hex(tag)[2:] return (f"{tag_str[:-4].zfill(4)}", tag_str[-4:]) @@ -49,27 +54,3 @@ def __init__(self, keyword_or_tag): def __str__(self): return f"'{self.keyword}' field ({','.join(self.tag)})" - - -# @FileSet.read_metadata.register -# def mrtrix_dicom_read_metadata( -# dcm: Dicom, selected_keys: ty.Optional[ty.Sequence[str]] = None -# ) -> ty.Mapping[str, ty.Any]: -# if dcminfo_path is None or selected_keys is None: -# return dicom_read_metadata(dcm, selected_keys) - -# tags = [keyword2tag(k) for k in selected_keys] -# tag_str = " ".join(f"-t {t[0]} {t[1]}" for t in tags) -# cmd = f"dcminfo {tag_str} {dcm.fspath}" -# line_re = re.compile(r"\[([0-9A-F]{4}),([0-9A-F]{4})] (.*)") -# dcminfo_output = sp.check_output(cmd, shell=True).decode("utf-8") -# metadata = {} -# for line in dcminfo_output.splitlines(): -# match = line_re.match(line) -# if not match: -# continue -# t1, t2, val = match.groups() -# key = tag2keyword((t1, t2)) -# val = val.strip() -# metadata[key] = val -# return metadata diff --git a/xnat_ingest/exceptions.py b/xnat_ingest/exceptions.py index 9b6631a..e1b1711 100644 --- a/xnat_ingest/exceptions.py +++ b/xnat_ingest/exceptions.py @@ -1,5 +1,3 @@ - - class UnsupportedModalityError(Exception): def __init__(self, msg): self.msg = msg @@ -10,7 +8,7 @@ def __init__(self, msg): self.msg = msg -class DicomParseError(StagingError): +class ImagingSessionParseError(StagingError): def __init__(self, msg): self.msg = msg diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index df8d4e9..468522f 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -5,28 +5,29 @@ import os.path import subprocess as sp from functools import cached_property +from typing_extensions import Self import shutil from copy import deepcopy import yaml from tqdm import tqdm import attrs from itertools import chain -from collections import defaultdict +from collections import defaultdict, Counter from pathlib import Path import pydicom from fileformats.generic import File from fileformats.application import Dicom from fileformats.medimage import DicomSeries from fileformats.core import from_paths, FileSet, DataType, from_mime, to_mime -from arcana.core.data.set import Dataset -from arcana.core.data.space import DataSpace -from arcana.core.data.row import DataRow -from arcana.core.data.store import DataStore -from arcana.core.data.entry import DataEntry -from arcana.core.data.tree import DataTree -from arcana.core.exceptions import ArcanaDataMatchError -from .exceptions import DicomParseError, StagingError -from .utils import add_exc_note, transform_paths, DicomField, AssociatedFiles +from frametree.core.frameset import FrameSet # type: ignore[import-untyped] +from frametree.core.axes import Axes # type: ignore[import-untyped] +from frametree.core.row import DataRow # type: ignore[import-untyped] +from frametree.core.store import Store # type: ignore[import-untyped] +from frametree.core.entry import DataEntry # type: ignore[import-untyped] +from frametree.core.tree import DataTree # type: ignore[import-untyped] +from frametree.core.exceptions import FrameTreeDataMatchError # type: ignore[import-untyped] +from .exceptions import ImagingSessionParseError, StagingError +from .utils import add_exc_note, transform_paths, AssociatedFiles from .dicom import dcmedit_path import random import string @@ -57,6 +58,9 @@ def scans_converter( scans: ty.Union[ty.Sequence[ImagingScan], ty.Dict[str, ImagingScan]] ): if isinstance(scans, ty.Sequence): + duplicates = [i for i, c in Counter(s.id for s in scans).items() if c > 1] + if duplicates: + raise ValueError(f"Found duplicate scan IDs in list of scans: {duplicates}") scans = {s.id: s for s in scans} return scans @@ -121,14 +125,14 @@ def dicom_dirs(self) -> ty.List[Path]: def select_resources( self, - dataset: ty.Optional[Dataset], + dataset: ty.Optional[FrameSet], always_include: ty.Sequence[str] = (), ) -> ty.Iterator[ty.Tuple[str, str, str, FileSet]]: """Returns selected resources that match the columns in the dataset definition Parameters ---------- - dataset : Dataset + dataset : FrameSet Arcana dataset definition always_include : sequence[str] mime-types or "mime-like" (see https://arcanaframework.github.io/fileformats/) @@ -151,14 +155,18 @@ def select_resources( "Either 'dataset' or 'always_include' must be specified to select " f"appropriate resources to upload from {self.name} session" ) - store = MockDataStore(self) + store = MockStore(self) uploaded = set() for mime_like in always_include: if mime_like == "all": fileformat = FileSet else: - fileformat = from_mime(mime_like) + fileformat = from_mime(mime_like) # type: ignore[assignment] + if isinstance(fileformat, FileSet): + raise ValueError( + f"{mime_like!r} does not correspond to a file format ({fileformat})" + ) for scan in self.scans.values(): for resource_name, fileset in scan.resources.items(): if isinstance(fileset, fileformat): @@ -168,7 +176,7 @@ def select_resources( for column in dataset.columns.values(): try: entry = column.match_entry(store.row) - except ArcanaDataMatchError as e: + except FrameTreeDataMatchError as e: raise StagingError( f"Did not find matching entry for {column} column in {dataset} from " f"{self.name} session" @@ -213,32 +221,52 @@ def metadata(self): return collated @classmethod - def from_dicoms( + def from_paths( cls, - dicoms_path: str | Path, - project_field: DicomField = DicomField("StudyID"), - subject_field: DicomField = DicomField("PatientID"), - visit_field: DicomField = DicomField("AccessionNumber"), + files_path: str | Path, + datatypes: ty.Union[ + ty.Type[FileSet], ty.Sequence[ty.Type[FileSet]] + ] = DicomSeries, + project_field: str = "StudyID", + subject_field: str = "PatientID", + visit_field: str = "AccessionNumber", + scan_id_field: str = "SeriesNumber", + scan_desc_field: str = "SeriesDescription", + resource_field: str = "ImageType", + session_field: str | None = "StudyInstanceUID", project_id: str | None = None, - ) -> ty.List["ImagingSession"]: + ) -> ty.List[Self]: """Loads all imaging sessions from a list of DICOM files Parameters ---------- - dicoms_path : str or Path - Path to a directory containging the DICOMS to load the sessions from, or a + files_path : str or Path + Path to a directory containging the resources to load the sessions from, or a glob string that selects the paths project_field : str - the name of the DICOM field that is to be interpreted as the corresponding - XNAT project + the metadata field that contains the XNAT project ID for the imaging session, + by default "StudyID" subject_field : str - the name of the DICOM field that is to be interpreted as the corresponding - XNAT project + the metadata field that contains the XNAT subject ID for the imaging session, + by default "PatientID" visit_field : str - the name of the DICOM field that is to be interpreted as the corresponding - XNAT project + the metadata field that contains the XNAT visit ID for the imaging session, + by default "AccessionNumber" + scan_id_field: str + the metadata field that contains the XNAT scan ID for the imaging session, + by default "SeriesNumber" + scan_desc_field: str + the metadata field that contains the XNAT scan description for the imaging session, + by default "SeriesDescription" + resource_field: str + the metadata field that contains the XNAT resource ID for the imaging session, + by default "ImageType" + session_field : str, optional + the name of the metadata field that uniquely identifies the session, used + to check that the values extracted from the IDs across the DICOM scans are + consistent across DICOM files within the session, by default "StudyInstanceUID" project_id : str - Override the project ID loaded from the DICOM header (useful when invoking + Override the project ID loaded from the metadata (useful when invoking manually) Returns @@ -248,99 +276,160 @@ def from_dicoms( Raises ------ - DicomParseError + ImagingSessionParseError if values extracted from IDs across the DICOM scans are not consistent across DICOM files within the session """ - if isinstance(dicoms_path, Path) or "*" not in dicoms_path: - dicoms_path = Path(dicoms_path) - if not dicoms_path.exists(): - raise ValueError(f"Provided DICOMs path '{dicoms_path}' does not exist") - if dicoms_path.is_dir(): - dicom_fspaths = list(Path(dicoms_path).iterdir()) + if isinstance(files_path, Path) or "*" not in files_path: + files_path = Path(files_path) + if not files_path.exists(): + raise ValueError(f"Provided DICOMs path '{files_path}' does not exist") + if files_path.is_dir(): + fspaths = list(Path(files_path).iterdir()) else: - dicom_fspaths = [dicoms_path] + fspaths = [files_path] else: - dicom_fspaths = [Path(p) for p in glob(dicoms_path, recursive=True)] + fspaths = [Path(p) for p in glob(files_path, recursive=True)] + + from_paths_kwargs = {} + if datatypes is DicomSeries: + from_paths_kwargs["specific_tags"] = [ + project_field, + subject_field, + visit_field, + session_field, + scan_id_field, + scan_desc_field, + resource_field, + ] + + if not isinstance(datatypes, ty.Sequence): + datatypes = [datatypes] # Sort loaded series by StudyInstanceUID (imaging session) - logger.info("Loading DICOM series from %s", str(dicoms_path)) - dicom_sessions = defaultdict(list) - for series in from_paths( - dicom_fspaths, - DicomSeries, + logger.info(f"Loading {datatypes} from {files_path}...") + resources = from_paths( + fspaths, + *datatypes, ignore=".*", - selected_keys=[ - "SeriesNumber", - "SeriesDescription", - "StudyInstanceUID", - "SOPInstanceUID", # used in ordering the contents of the dicom series - project_field.keyword, - subject_field.keyword, - visit_field.keyword, - ], + **from_paths_kwargs, # type: ignore[arg-type] + ) + sessions: ty.Dict[ty.Tuple[str, str, str] | str, Self] = {} + multiple_sessions: ty.DefaultDict[str, ty.Set[ty.Tuple[str, str, str]]] = ( + defaultdict(set) + ) + multiple_scan_types: ty.DefaultDict[ + ty.Tuple[str, str, str, str], ty.Set[str] + ] = defaultdict(set) + multiple_resources: ty.DefaultDict[ + ty.Tuple[str, str, str, str, str], ty.Set[str] + ] = defaultdict(set) + for resource in tqdm( + resources, + "Sorting resources into XNAT tree structure...", ): - # Restrict the metadata fields that are loaded (others are ignored), - # for performance reasons - dicom_sessions[series.metadata["StudyInstanceUID"]].append(series) - - # Construct sessions from sorted series - logger.info("Searching for associated files ") - sessions = [] - for session_dicom_series in dicom_sessions.values(): - - def get_id(field): - ids = set(s.metadata.get(field.keyword) for s in session_dicom_series) - ids.discard(None) - if ids: - if len(ids) > 1: - raise DicomParseError( - f"Multiple values for '{field}' tag found across scans in session: " - f"{session_dicom_series}" - ) - id_ = next(iter(ids)) - if isinstance(id_, list): - raise DicomParseError( - f"Multiple values for '{field}' tag found within scans in session: " - f"{session_dicom_series}" + session_uid = resource.metadata[session_field] if session_field else None + + def get_id(field_type: str, field_name: str) -> str: + try: + value = resource.metadata[field_name] + except KeyError: + if session_uid and field_type in ("project", "subject", "visit"): + value = ( + "INVALID-MISSING-" + + field_type.upper() + + "-" + + "".join( + random.choices( + string.ascii_letters + string.digits, k=8 + ) + ) ) - id_ = cls.id_escape_re.sub("", id_) - else: - logger.warning( - "Did not find %s field in DICOM series %s", - field.keyword, - session_dicom_series, + raise ImagingSessionParseError( + f"Did not find '{field_name}' field in {resource}, " + "cannot uniquely identify the resource" + ) + return value + + if not project_id: + project_id = get_id("project", project_field) + subject_id = get_id("subject", subject_field) + visit_id = get_id("visit", visit_field) + scan_id = get_id("scan", scan_id_field) + scan_type = get_id("scan type", scan_desc_field) + resource_id = get_id("resource", resource_field) + + if session_uid is None: + session_uid = (project_id, subject_id, visit_id) + try: + session = sessions[session_uid] + except KeyError: + session = cls( + project_id=project_id, + subject_id=subject_id, + visit_id=visit_id, + ) + sessions[session_uid] = session + else: + if (session.project_id, session.subject_id, session.visit_id) != ( + project_id, + subject_id, + visit_id, + ): + # Record all issues with the session IDs for raising exception at the end + multiple_sessions[session_uid].add( + (project_id, subject_id, visit_id) ) - id_ = None - if not id_: - id_ = "INVALID-MISSING-ID-" + "".join( - random.choices(string.ascii_letters + string.digits, k=8) + multiple_sessions[session_uid].add( + (session.project_id, session.subject_id, session.visit_id) ) - return id_ - scans = [] - for dicom_series in session_dicom_series: - series_description = dicom_series.metadata["SeriesDescription"] - if isinstance(series_description, list): - series_description = series_description[0] - scans.append( - ImagingScan( - id=str(dicom_series.metadata["SeriesNumber"]), - type=str(series_description), - resources={"DICOM": dicom_series}, - ) + try: + scan = session.scans[scan_id] + except KeyError: + scan = ImagingScan(id=scan_id, type=scan_type, resources={}) + session.scans[scan_id] = scan + else: + if scan.type != scan_type: + # Record all issues with the scan types for raising exception at the end + multiple_scan_types[ + (project_id, subject_id, visit_id, scan_id) + ].add(scan_type) + + if resource_id in scan.resources: + multiple_resources[ + (project_id, subject_id, visit_id, scan_id, scan_type) + ].add(resource_id) + scan.resources[resource_id] = resource + + if multiple_sessions: + raise ImagingSessionParseError( + "Multiple sessions found with the same project/subject/visit ID triplets: " + + "\n".join( + f"{i} -> {p}:{s}:{v}" for i, (p, s, v) in multiple_sessions.items() ) + ) - sessions.append( - cls( - scans=scans, - project_id=(project_id if project_id else get_id(project_field)), - subject_id=get_id(subject_field), - visit_id=get_id(visit_field), + if multiple_scan_types: + raise ImagingSessionParseError( + "Multiple scans found with the same project/subject/visit/scan ID " + "quadruplets: " + + "\n".join( + f"{p}:{s}:{v}:{sc} -> " + ", ".join(st) + for (p, s, v, sc), st in multiple_scan_types.items() + ) + ) + if multiple_resources: + raise ImagingSessionParseError( + "Multiple resources found with the same project/subject/visit/scan/resource " + "ID quintuplets: " + + "\n".join( + f"{p}:{s}:{v}:{sc}:{r} -> " + ", ".join(rs) + for (p, s, v, sc, r), rs in multiple_resources.items() ) ) - return sessions + return list(sessions.values()) @classmethod def load( @@ -391,7 +480,7 @@ def load( id=scan_id, type=scan_dict["type"], resources={ - n: from_mime(d["datatype"])( + n: from_mime(d["datatype"])( # type: ignore[call-arg, misc] session_dir.joinpath(*p.split("/")) for p in d["fspaths"] ) @@ -481,7 +570,7 @@ def save(self, save_dir: Path, just_manifest: bool = False) -> "ImagingSession": ) saved.scans[scan.id].resources[resource_name] = fileset resources_dict[resource_name] = { - "datatype": to_mime(fileset, official=False), + "datatype": to_mime(type(fileset), official=False), "fspaths": [ # Ensure it is a relative path using POSIX forward slashes str(p.absolute().relative_to(session_dir)).replace("\\", "/") @@ -508,7 +597,7 @@ def stage( deidentify: bool = True, project_list: ty.Optional[ty.List[str]] = None, spaces_to_underscores: bool = False, - ) -> "ImagingSession": + ) -> Self: r"""Stages and deidentifies files by removing the fields listed `FIELDS_TO_ANONYMISE` and replacing birth date with 01/01/ and returning new imaging session @@ -567,7 +656,7 @@ def stage( for scan in tqdm( self.scans.values(), f"Staging DICOM sessions to {session_dir}" ): - staged_resources = {} + staged_resources: ty.Dict[str, FileSet] = {} for resource_name, fileset in scan.resources.items(): # Ensure scan type is a valid directory name scan_dir = session_dir / f"{scan.id}-{scan.type}" / resource_name @@ -842,7 +931,7 @@ def deidentify_dicom( @attrs.define -class MockDataStore(DataStore): +class MockStore(Store): """Mock data store so we can use the column.match_entry method on the "entries" in the data row """ @@ -853,7 +942,7 @@ class MockDataStore(DataStore): def row(self): return DataRow( ids={DummySpace._: None}, - dataset=Dataset(id=None, store=self, hierarchy=[], space=DummySpace), + dataset=FrameSet(id=None, store=self, hierarchy=[], space=DummySpace), frequency=DummySpace._, ) @@ -957,5 +1046,5 @@ def create_entry(self, path: str, datatype: type, row: DataRow) -> DataEntry: raise NotImplementedError -class DummySpace(DataSpace): +class DummySpace(Axes): _ = 0b0 From d79e6e3ca3d226073838e602e12a07918becb8af Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 17 Sep 2024 18:28:17 +1000 Subject: [PATCH 3/7] fixed typing --- xnat_ingest/cli/stage.py | 5 ++--- xnat_ingest/session.py | 14 +++++++------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index d184401..05d74fb 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -6,9 +6,8 @@ from tqdm import tqdm from xnat_ingest.cli.base import cli from xnat_ingest.session import ImagingSession -from frametree.xnat import Xnat +from frametree.xnat import Xnat # type: ignore[import-untyped] from xnat_ingest.utils import ( - DicomField, AssociatedFiles, logger, LogFile, @@ -39,7 +38,7 @@ type=str, metavar="", multiple=True, - default="medimage/dicom-series", + default=["medimage/dicom-series"], envvar="XNAT_INGEST_STAGE_DATATYPE", help="The datatype of the primary files to to upload", ) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index 468522f..a95914e 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -5,15 +5,18 @@ import os.path import subprocess as sp from functools import cached_property -from typing_extensions import Self import shutil +import random +import string +import platform from copy import deepcopy -import yaml -from tqdm import tqdm -import attrs from itertools import chain from collections import defaultdict, Counter from pathlib import Path +from typing_extensions import Self +import attrs +from tqdm import tqdm +import yaml import pydicom from fileformats.generic import File from fileformats.application import Dicom @@ -29,9 +32,6 @@ from .exceptions import ImagingSessionParseError, StagingError from .utils import add_exc_note, transform_paths, AssociatedFiles from .dicom import dcmedit_path -import random -import string -import platform logger = logging.getLogger("xnat-ingest") From bb2b215f5c61ea219b9b315743781990be5fdd70 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 17 Sep 2024 21:47:46 +1000 Subject: [PATCH 4/7] debugging unnitests after change to using frametree and switching to flexible formats --- pyproject.toml | 4 ++-- scripts/dcm_performance_mrtrix.py | 6 +++--- scripts/dcm_performance_pydicom.py | 6 +++--- xnat_ingest/cli/stage.py | 10 +++++----- xnat_ingest/cli/upload.py | 12 ++++++------ xnat_ingest/session.py | 14 +++++++------- xnat_ingest/tests/test_cli.py | 18 +++++++++--------- xnat_ingest/tests/test_dicom.py | 19 +++++++++++-------- xnat_ingest/tests/test_session.py | 26 +++++++++++++------------- 9 files changed, 59 insertions(+), 56 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 042b63d..a23080d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,8 +16,8 @@ dependencies = [ "natsort", "paramiko", "xnat", - "arcana", - "arcana-xnat >=0.4.1", + "frametree", + "frametree-xnat", ] license = { file = "LICENSE" } authors = [{ name = "Thomas G. Close", email = "thomas.close@sydney.edu.au" }] diff --git a/scripts/dcm_performance_mrtrix.py b/scripts/dcm_performance_mrtrix.py index 63c17a4..8947934 100644 --- a/scripts/dcm_performance_mrtrix.py +++ b/scripts/dcm_performance_mrtrix.py @@ -9,9 +9,9 @@ "StudyInstanceUID", "StudyID", "PatientID", - "AccessionNumber" + "AccessionNumber", ] -series = DicomSeries(get_image().iterdir()) +series = DicomSeries(get_image().iterdir(), specific_tags=METADATA_KEYS) -timeit.timeit(lambda: series.select_metadata(METADATA_KEYS)) +timeit.timeit(lambda: series.metadata) diff --git a/scripts/dcm_performance_pydicom.py b/scripts/dcm_performance_pydicom.py index 6f17717..b5423d3 100644 --- a/scripts/dcm_performance_pydicom.py +++ b/scripts/dcm_performance_pydicom.py @@ -8,9 +8,9 @@ "StudyInstanceUID", "StudyID", "PatientID", - "AccessionNumber" + "AccessionNumber", ] -series = DicomSeries(get_image().iterdir()) +series = DicomSeries(get_image().iterdir(), specific_tags=METADATA_KEYS) -timeit.timeit(lambda: series.select_metadata(METADATA_KEYS)) +timeit.timeit(lambda: series.metadata) diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index 05d74fb..0b5083a 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -229,7 +229,7 @@ def stage( files_path: str, staging_dir: Path, datatype: str, - associated_files: AssociatedFiles, + associated_files: ty.List[AssociatedFiles], project_field: str, subject_field: str, visit_field: str, @@ -274,10 +274,10 @@ def stage( msg = f"Loading {datatype} sessions from '{files_path}'" - if associated_files: - msg += f" with associated files selected from '{associated_files.glob}'" - if not associated_files.glob.startswith("/"): - msg += " (relative to the directories in which the DICOMs are found)" + for assoc_files in associated_files: + msg += f" with associated files selected from '{assoc_files.glob}'" + if not assoc_files.glob.startswith("/"): + msg += " (relative to the directories in which the primary files are found)" logger.info(msg) diff --git a/xnat_ingest/cli/upload.py b/xnat_ingest/cli/upload.py index a7e3d3a..dae28a2 100644 --- a/xnat_ingest/cli/upload.py +++ b/xnat_ingest/cli/upload.py @@ -11,13 +11,13 @@ import click from tqdm import tqdm from natsort import natsorted -import xnat +import xnat # type: ignore[import-untyped] import boto3 import paramiko from fileformats.generic import File -from arcana.core.data.set import Dataset -from arcana.xnat import Xnat -from xnat.exceptions import XNATResponseError +from frametree.core.frameset import FrameSet # type: ignore[import-untyped] +from frametree.xnat import Xnat # type: ignore[import-untyped] +from xnat.exceptions import XNATResponseError # type: ignore[import-untyped] from xnat_ingest.cli.base import cli from xnat_ingest.session import ImagingSession from xnat_ingest.utils import ( @@ -349,7 +349,7 @@ def iter_staged_sessions(): missing_datasets = set() for project_id in project_ids: try: - dataset = Dataset.load(project_id, xnat_repo) + dataset = FrameSet.load(project_id, xnat_repo) except Exception: missing_datasets.add(project_id) else: @@ -392,7 +392,7 @@ def iter_staged_sessions(): # Access Arcana dataset associated with project try: - dataset = Dataset.load(session.project_id, xnat_repo) + dataset = FrameSet.load(session.project_id, xnat_repo) except Exception as e: logger.warning( "Did not load dataset definition (%s) from %s project " diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index a95914e..a40ae1b 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -941,9 +941,9 @@ class MockStore(Store): @property def row(self): return DataRow( - ids={DummySpace._: None}, - dataset=FrameSet(id=None, store=self, hierarchy=[], space=DummySpace), - frequency=DummySpace._, + ids={DummyAxes._: None}, + frameset=FrameSet(id=None, store=self, hierarchy=[], axes=DummyAxes), + frequency=DummyAxes._, ) def populate_row(self, row: DataRow): @@ -1011,7 +1011,7 @@ def create_data_tree( id: str, leaves: ty.List[ty.Tuple[str, ...]], hierarchy: ty.List[str], - space: type, + axes: type, **kwargs, ): raise NotImplementedError @@ -1029,12 +1029,12 @@ def put_provenance(self, provenance: ty.Dict[str, ty.Any], entry: DataEntry): def get_provenance(self, entry: DataEntry) -> ty.Dict[str, ty.Any]: raise NotImplementedError - def save_dataset_definition( + def save_frameset_definition( self, dataset_id: str, definition: ty.Dict[str, ty.Any], name: str ): raise NotImplementedError - def load_dataset_definition( + def load_frameset_definition( self, dataset_id: str, name: str ) -> ty.Dict[str, ty.Any]: raise NotImplementedError @@ -1046,5 +1046,5 @@ def create_entry(self, path: str, datatype: type, row: DataRow) -> DataEntry: raise NotImplementedError -class DummySpace(Axes): +class DummyAxes(Axes): _ = 0b0 diff --git a/xnat_ingest/tests/test_cli.py b/xnat_ingest/tests/test_cli.py index ca717d5..b71c491 100644 --- a/xnat_ingest/tests/test_cli.py +++ b/xnat_ingest/tests/test_cli.py @@ -1,28 +1,28 @@ import os import shutil from pathlib import Path -from arcana.core.cli.dataset import ( +from frametree.core.cli import ( # type: ignore[import-untyped] define as dataset_define, add_source as dataset_add_source, ) -import xnat4tests -from arcana.core.cli.store import add as store_add +import xnat4tests # type: ignore[import-untyped] +from frametree.core.cli.store import add as store_add # type: ignore[import-untyped] from xnat_ingest.cli import stage, upload from xnat_ingest.utils import show_cli_trace from fileformats.medimage import DicomSeries -from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_pet_image, ) -from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_ac_image, ) -from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_topogram_image, ) -from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_statistics_image, ) -from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_files as get_raw_data_files, ) @@ -136,8 +136,8 @@ def test_stage_and_upload( result = cli_runner( store_add, [ + "xnat", "testxnat", - "xnat:Xnat", "--server", xnat_server, "--user", diff --git a/xnat_ingest/tests/test_dicom.py b/xnat_ingest/tests/test_dicom.py index 9769e83..703b088 100644 --- a/xnat_ingest/tests/test_dicom.py +++ b/xnat_ingest/tests/test_dicom.py @@ -1,10 +1,9 @@ import pytest import platform -from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_pet_image, ) from fileformats.medimage import DicomSeries -from xnat_ingest.session import ImagingSession # PATIENT_ID = "patient-id" # STUDY_ID = "study-id" @@ -12,11 +11,13 @@ @pytest.fixture -def dicom_series(scope="module") -> ImagingSession: +def dicom_series(scope="module") -> DicomSeries: return DicomSeries(get_pet_image().iterdir()) -@pytest.mark.xfail(condition=(platform.system() == "Linux"), reason="Not working on ubuntu") +@pytest.mark.xfail( + condition=(platform.system() == "Linux"), reason="Not working on ubuntu" +) def test_mrtrix_dicom_metadata(dicom_series: DicomSeries): keys = [ "AccessionNumber", @@ -26,13 +27,15 @@ def test_mrtrix_dicom_metadata(dicom_series: DicomSeries): "StudyInstanceUID", "SOPInstanceUID", ] - dicom_series.select_metadata(keys) + dicom_series = DicomSeries(dicom_series, specific_tags=keys) - assert sorted(dicom_series.metadata) == sorted(keys + ['SpecificCharacterSet']) + assert sorted(dicom_series.metadata) == sorted(keys + ["SpecificCharacterSet"]) assert dicom_series.metadata["PatientName"] == "GivenName^FamilyName" assert dicom_series.metadata["AccessionNumber"] == "987654321" - assert dicom_series.metadata["PatientID"] == 'Session Label' + assert dicom_series.metadata["PatientID"] == "Session Label" assert dicom_series.metadata["StudyID"] == "PROJECT_ID" assert not isinstance(dicom_series.metadata["StudyInstanceUID"], list) assert isinstance(dicom_series.metadata["SOPInstanceUID"], list) - assert len(dicom_series.metadata["SOPInstanceUID"]) == len(list(dicom_series.contents)) + assert len(dicom_series.metadata["SOPInstanceUID"]) == len( + list(dicom_series.contents) + ) diff --git a/xnat_ingest/tests/test_session.py b/xnat_ingest/tests/test_session.py index 578620a..b3b0fb4 100644 --- a/xnat_ingest/tests/test_session.py +++ b/xnat_ingest/tests/test_session.py @@ -8,29 +8,29 @@ Vnd_Siemens_Biograph128Vision_Vr20b_PetListMode, Vnd_Siemens_Biograph128Vision_Vr20b_PetSinogram, ) -from arcana.core.data.set import Dataset -from arcana.common import DirTree -from medimages4tests.dummy.dicom.base import default_dicom_dir -from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( +from frametree.core.frameset import FrameSet # type: ignore[import-untyped] +from frametree.common import FileSystem # type: ignore[import-untyped] +from medimages4tests.dummy.dicom.base import default_dicom_dir # type: ignore[import-untyped] +from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_pet_image, __file__ as pet_src_file, ) -from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_ac_image, __file__ as ac_src_file, ) -from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_topogram_image, __file__ as topogram_src_file, ) -from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_statistics_image, __file__ as statistics_src_file, ) -from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( +from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_files as get_raw_data_files, ) -from xnat_ingest.session import ImagingSession, ImagingScan, DummySpace +from xnat_ingest.session import ImagingSession, ImagingScan, DummyAxes from xnat_ingest.utils import AssociatedFiles @@ -84,7 +84,7 @@ def imaging_session() -> ImagingSession: @pytest.fixture -def dataset(tmp_path: Path) -> Dataset: +def dataset(tmp_path: Path) -> FrameSet: """For use in tests, this method creates a test dataset from the provided blueprint @@ -104,12 +104,12 @@ def dataset(tmp_path: Path) -> Dataset: passed through to create_dataset """ dataset_path = tmp_path / "a-dataset" - store = DirTree() + store = FileSystem() dataset = store.create_dataset( id=dataset_path, leaves=[], hierarchy=[], - space=DummySpace, + axes=DummyAxes, ) for col_name, col_type, col_pattern in [ ("pet", "medimage/dicom-series", "PET SWB 8MIN"), @@ -139,7 +139,7 @@ def dataset(tmp_path: Path) -> Dataset: condition=platform.system() == "Linux", reason="Not working on ubuntu" ) def test_session_select_resources( - imaging_session: ImagingSession, dataset: Dataset, tmp_path: Path + imaging_session: ImagingSession, dataset: FrameSet, tmp_path: Path ): assoc_dir = tmp_path / "assoc" From e566c81663c20453506be58b5e61fdfd0b5fb187 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 20 Sep 2024 17:52:25 +1000 Subject: [PATCH 5/7] debugged stage and upload unittest --- conftest.py | 15 +++++- scripts/run_upload.py | 2 +- xnat_ingest/cli/stage.py | 10 ++-- xnat_ingest/cli/upload.py | 1 + xnat_ingest/session.py | 80 +++++++++++++++++++++---------- xnat_ingest/tests/test_cli.py | 37 +++++++------- xnat_ingest/tests/test_session.py | 62 ++++++++++++------------ xnat_ingest/utils.py | 42 ++++++++++------ 8 files changed, 156 insertions(+), 93 deletions(-) diff --git a/conftest.py b/conftest.py index 84be628..33713ed 100644 --- a/conftest.py +++ b/conftest.py @@ -1,13 +1,20 @@ import os from pathlib import Path import logging +import typing as ty import tempfile from logging.handlers import SMTPHandler import pytest from click.testing import CliRunner -import xnat4tests +import xnat4tests # type: ignore[import-untyped] from datetime import datetime from xnat_ingest.utils import logger +from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b.pet_listmode import ( + get_data as get_listmode_data, +) +from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b.pet_countrate import ( + get_data as get_countrate_data, +) # Set DEBUG logging for unittests @@ -110,3 +117,9 @@ def emit(self, record): # Capture the email message and append it to the list msg = self.format(record) self.emails.append(msg) + + +def get_raw_data_files(out_dir: ty.Optional[Path] = None, **kwargs) -> ty.List[Path]: + if out_dir is None: + out_dir = Path(tempfile.mkdtemp()) + return get_listmode_data(out_dir, **kwargs) + get_countrate_data(out_dir, **kwargs) diff --git a/scripts/run_upload.py b/scripts/run_upload.py index 4a2d95e..1d9f38a 100644 --- a/scripts/run_upload.py +++ b/scripts/run_upload.py @@ -3,7 +3,7 @@ from xnat_ingest.utils import show_cli_trace from click.testing import CliRunner -PATTERN = "{PatientName.given_name}_{PatientName.family_name}_{SeriesDate}.*" +PATTERN = "{PatientName.family_name}_{PatientName.given_name}_{SeriesDate}.*" runner = CliRunner() diff --git a/xnat_ingest/cli/stage.py b/xnat_ingest/cli/stage.py index 0b5083a..a6abeaa 100644 --- a/xnat_ingest/cli/stage.py +++ b/xnat_ingest/cli/stage.py @@ -95,7 +95,7 @@ @click.option( "--resource-field", type=str, - default="ImageType", + default="ImageType[-1]", envvar="XNAT_INGEST_STAGE_RESOURCE", help=( "The keyword of the metadata field to extract the XNAT imaging resource ID from " @@ -110,11 +110,11 @@ @click.option( "--associated-files", type=AssociatedFiles.cli_type, - nargs=2, + nargs=3, default=None, multiple=True, envvar="XNAT_INGEST_STAGE_ASSOCIATED", - metavar=" ", + metavar=" ", help=( 'The "glob" arg is a glob pattern by which to detect associated files to be ' "attached to the DICOM sessions. Note that when this pattern corresponds to a " @@ -122,7 +122,7 @@ "the DICOMs for the session NOT the current working directory Can contain string " "templates corresponding to DICOM metadata fields, which are substituted before " "the glob is called. For example, " - '"./associated/{PatientName.given_name}_{PatientName.family_name}/*)" ' + '"./associated/{PatientName.family_name}_{PatientName.given_name}/*)" ' "will find all files under the subdirectory within '/path/to/dicoms/associated' that matches " "_. Will be interpreted as being relative to `dicoms_dir` " "if a relative path is provided.\n" @@ -308,7 +308,7 @@ def stage( # Identify theDeidentify files if necessary and save them to the staging directory session.stage( staging_dir, - associated_files=associated_files, + associated_file_groups=associated_files, remove_original=delete, deidentify=deidentify, project_list=project_list, diff --git a/xnat_ingest/cli/upload.py b/xnat_ingest/cli/upload.py index dae28a2..7de9b4a 100644 --- a/xnat_ingest/cli/upload.py +++ b/xnat_ingest/cli/upload.py @@ -446,6 +446,7 @@ def iter_staged_sessions(): image_type = scan.metadata.get("ImageType") if image_type and image_type[:2] == ["DERIVED", "SECONDARY"]: modality = "SC" + resource_name = "secondary" else: modality = scan.metadata.get( "Modality", default_scan_modality diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index a40ae1b..e5ab40a 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -18,7 +18,6 @@ from tqdm import tqdm import yaml import pydicom -from fileformats.generic import File from fileformats.application import Dicom from fileformats.medimage import DicomSeries from fileformats.core import from_paths, FileSet, DataType, from_mime, to_mime @@ -43,9 +42,20 @@ def scan_type_converter(scan_type: str) -> str: @attrs.define class ImagingScan: + """Representation of a scan to be uploaded to XNAT + + Parameters + ---------- + id: str + the ID of the scan on XNAT + type: str + the scan type/description + """ + id: str type: str = attrs.field(converter=scan_type_converter) resources: ty.Dict[str, FileSet] = attrs.field() + associated: bool = False def __contains__(self, resource_name): return resource_name in self.resources @@ -115,13 +125,22 @@ def modalities(self) -> ty.Set[str]: return modalities @property - def dicoms(self): - return (scan["DICOM"] for scan in self.scans.values() if "DICOM" in scan) + def parent_dirs(self) -> ty.Set[Path]: + "Return parent directories for all resources in the session" + return set(r.parent for r in self.resources) @property - def dicom_dirs(self) -> ty.List[Path]: - "A common parent directory for all the top-level paths in the file-set" - return [p.parent for p in self.dicoms] # type: ignore + def resources(self) -> ty.List[FileSet]: + return [r for p in self.scans.values() for r in p.resources.values()] + + @property + def primary_resources(self) -> ty.List[FileSet]: + return [ + r + for s in self.scans.values() + for r in s.resources.values() + if not s.associated + ] def select_resources( self, @@ -199,13 +218,13 @@ def select_resources( @cached_property def metadata(self): - all_dicoms = list(self.dicoms) - all_keys = [list(d.metadata.keys()) for d in all_dicoms if d.metadata] + primary_resources = self.primary_resources + all_keys = [list(d.metadata.keys()) for d in primary_resources if d.metadata] common_keys = [ k for k in set(chain(*all_keys)) if all(k in keys for keys in all_keys) ] - collated = {k: all_dicoms[0].metadata[k] for k in common_keys} - for i, series in enumerate(all_dicoms[1:], start=1): + collated = {k: primary_resources[0].metadata[k] for k in common_keys} + for i, series in enumerate(primary_resources[1:], start=1): for key in common_keys: val = series.metadata[key] if val != collated[key]: @@ -232,7 +251,7 @@ def from_paths( visit_field: str = "AccessionNumber", scan_id_field: str = "SeriesNumber", scan_desc_field: str = "SeriesDescription", - resource_field: str = "ImageType", + resource_field: str = "ImageType[-1]", session_field: str | None = "StudyInstanceUID", project_id: str | None = None, ) -> ty.List[Self]: @@ -260,7 +279,7 @@ def from_paths( by default "SeriesDescription" resource_field: str the metadata field that contains the XNAT resource ID for the imaging session, - by default "ImageType" + by default "ImageType[-1]" session_field : str, optional the name of the metadata field that uniquely identifies the session, used to check that the values extracted from the IDs across the DICOM scans are @@ -331,8 +350,13 @@ def from_paths( session_uid = resource.metadata[session_field] if session_field else None def get_id(field_type: str, field_name: str) -> str: + if match := re.match(r"(\w+)\[([\-\d]+)\]", field_name): + field_name, index = match.groups() + index = int(index) + else: + index = None try: - value = resource.metadata[field_name] + value = str(resource.metadata[field_name]) except KeyError: if session_uid and field_type in ("project", "subject", "visit"): value = ( @@ -349,6 +373,8 @@ def get_id(field_type: str, field_name: str) -> str: f"Did not find '{field_name}' field in {resource}, " "cannot uniquely identify the resource" ) + if index is not None: + value = value[index] return value if not project_id: @@ -357,7 +383,10 @@ def get_id(field_type: str, field_name: str) -> str: visit_id = get_id("visit", visit_field) scan_id = get_id("scan", scan_id_field) scan_type = get_id("scan type", scan_desc_field) - resource_id = get_id("resource", resource_field) + if isinstance(resource, DicomSeries): + resource_id = "DICOM" + else: + resource_id = get_id("resource", resource_field) if session_uid is None: session_uid = (project_id, subject_id, visit_id) @@ -592,7 +621,7 @@ def save(self, save_dir: Path, just_manifest: bool = False) -> "ImagingSession": def stage( self, dest_dir: Path, - associated_files: ty.Optional[AssociatedFiles] = None, + associated_file_groups: ty.Collection[AssociatedFiles] = (), remove_original: bool = False, deidentify: bool = True, project_list: ty.Optional[ty.List[str]] = None, @@ -610,12 +639,12 @@ def stage( work_dir : Path, optional the directory the staged sessions are created in before they are moved into the staging directory - associated_files : ty.Tuple[str, str], optional + associated_file_groups : Collection[AssociatedFiles], optional Glob pattern used to select the non-dicom files to include in the session. Note that the pattern is relative to the parent directory containing the DICOM files NOT the current working directory. The glob pattern can contain string template placeholders corresponding to DICOM - metadata (e.g. '{PatientName.given_name}_{PatientName.family_name}'), which + metadata (e.g. '{PatientName.family_name}_{PatientName.given_name}'), which are substituted before the string is used to glob the non-DICOM files. In order to deidentify the filenames, the pattern must explicitly reference all identifiable fields in string template placeholders. By default, None @@ -653,6 +682,7 @@ def stage( project_dir = "INVALID-UNRECOGNISED-PROJECT-" + self.project_id session_dir = dest_dir / project_dir / self.subject_id / self.visit_id session_dir.mkdir(parents=True) + session_metadata = self.metadata for scan in tqdm( self.scans.values(), f"Staging DICOM sessions to {session_dir}" ): @@ -686,14 +716,14 @@ def stage( staged_scans.append( ImagingScan(id=scan.id, type=scan.type, resources=staged_resources) ) - if associated_files: + for associated_files in associated_file_groups: # substitute string templates int the glob template with values from the # DICOM metadata to construct a glob pattern to select files associated # with current session associated_fspaths: ty.Set[Path] = set() - for dicom_dir in self.dicom_dirs: + for parent_dir in self.parent_dirs: assoc_glob = str( - dicom_dir / associated_files.glob.format(**self.metadata) + parent_dir / associated_files.glob.format(**session_metadata) ) if spaces_to_underscores: assoc_glob = assoc_glob.replace(" ", "_") @@ -705,7 +735,7 @@ def stage( logger.info( "Found %s associated file paths matching '%s'", len(associated_fspaths), - assoc_glob, + associated_files.glob, ) tmpdir = session_dir / ".tmp" @@ -720,12 +750,12 @@ def stage( assoc_glob_pattern = associated_files.glob else: assoc_glob_pattern = ( - str(dicom_dir) + os.path.sep + associated_files.glob + str(parent_dir) + os.path.sep + associated_files.glob ) transformed_fspaths = transform_paths( list(associated_fspaths), assoc_glob_pattern, - self.metadata, + session_metadata, staged_metadata, spaces_to_underscores=spaces_to_underscores, ) @@ -804,13 +834,13 @@ def stage( else: shutil.copyfile(fspath, dest_path) resource_fspaths.append(dest_path) - format_type = File if len(fspaths) == 1 else FileSet - scan_resources[resource_name] = format_type(resource_fspaths) + scan_resources[resource_name] = associated_files.datatype(resource_fspaths) staged_scans.append( ImagingScan( id=scan_id, type=scan_type, resources=scan_resources, + associated=True, ) ) os.rmdir(tmpdir) # Should be empty diff --git a/xnat_ingest/tests/test_cli.py b/xnat_ingest/tests/test_cli.py index b71c491..88e0c6f 100644 --- a/xnat_ingest/tests/test_cli.py +++ b/xnat_ingest/tests/test_cli.py @@ -1,5 +1,6 @@ import os import shutil +from datetime import datetime from pathlib import Path from frametree.core.cli import ( # type: ignore[import-untyped] define as dataset_define, @@ -22,12 +23,10 @@ from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] get_image as get_statistics_image, ) -from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] - get_files as get_raw_data_files, -) +from conftest import get_raw_data_files -PATTERN = "{PatientName.given_name}_{PatientName.family_name}_{SeriesDate}.*" +PATTERN = "{PatientName.family_name}_{PatientName.given_name}_{SeriesDate}.*" def test_stage_and_upload( @@ -62,7 +61,6 @@ def test_stage_and_upload( for i, c in enumerate("abc"): first_name = f"First{c.upper()}" last_name = f"Last{c.upper()}" - PatientName = f"{first_name}^{last_name}" PatientID = f"subject{i}" AccessionNumber = f"98765432{i}" session_ids.append(f"{PatientID}_{AccessionNumber}") @@ -74,7 +72,8 @@ def test_stage_and_upload( series = DicomSeries( get_pet_image( tmp_gen_dir / f"pet{i}", - PatientName=PatientName, + first_name=first_name, + last_name=last_name, StudyInstanceUID=StudyInstanceUID, PatientID=PatientID, AccessionNumber=AccessionNumber, @@ -86,7 +85,8 @@ def test_stage_and_upload( series = DicomSeries( get_ac_image( tmp_gen_dir / f"ac{i}", - PatientName=PatientName, + first_name=first_name, + last_name=last_name, StudyInstanceUID=StudyInstanceUID, PatientID=PatientID, AccessionNumber=AccessionNumber, @@ -98,7 +98,8 @@ def test_stage_and_upload( series = DicomSeries( get_topogram_image( tmp_gen_dir / f"topogram{i}", - PatientName=PatientName, + first_name=first_name, + last_name=last_name, StudyInstanceUID=StudyInstanceUID, PatientID=PatientID, AccessionNumber=AccessionNumber, @@ -110,7 +111,8 @@ def test_stage_and_upload( series = DicomSeries( get_statistics_image( tmp_gen_dir / f"statistics{i}", - PatientName=PatientName, + first_name=first_name, + last_name=last_name, StudyInstanceUID=StudyInstanceUID, PatientID=PatientID, AccessionNumber=AccessionNumber, @@ -123,7 +125,7 @@ def test_stage_and_upload( tmp_gen_dir / f"non-dicom{i}", first_name=first_name, last_name=last_name, - date_time=f"2023.08.25.15.50.5{i}", + date_time=datetime(2023, 8, 25, 15, 50, 5, i), ) for assoc_fspath in assoc_fspaths: os.link( @@ -163,11 +165,11 @@ def test_stage_and_upload( "medimage/vnd.siemens.biograph128-vision.vr20b.pet-list-mode", ".*/LISTMODE", ), - ( - "sinogram", - "medimage/vnd.siemens.biograph128-vision.vr20b.pet-sinogram", - ".*/EM_SINO", - ), + # ( + # "sinogram", + # "medimage/vnd.siemens.biograph128-vision.vr20b.pet-sinogram", + # ".*/EM_SINO", + # ), ( "countrate", "medimage/vnd.siemens.biograph128-vision.vr20b.pet-count-rate", @@ -193,8 +195,9 @@ def test_stage_and_upload( str(dicoms_dir), str(staging_dir), "--associated-files", + "medimage/vnd.siemens.biograph128-vision.vr20b.pet-raw-data", str(associated_files_dir) - + "/{PatientName.given_name}_{PatientName.family_name}*.ptd", + + "/{PatientName.family_name}_{PatientName.given_name}*.ptd", r".*/[^\.]+.[^\.]+.[^\.]+.(?P\d+)\.[A-Z]+_(?P[^\.]+).*", "--log-file", str(log_file), @@ -249,5 +252,5 @@ def test_stage_and_upload( "4", "6", "602", - "603", + # "603", ] diff --git a/xnat_ingest/tests/test_session.py b/xnat_ingest/tests/test_session.py index b3b0fb4..6f5a9e2 100644 --- a/xnat_ingest/tests/test_session.py +++ b/xnat_ingest/tests/test_session.py @@ -27,12 +27,9 @@ get_image as get_statistics_image, __file__ as statistics_src_file, ) -from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped] - get_files as get_raw_data_files, -) from xnat_ingest.session import ImagingSession, ImagingScan, DummyAxes from xnat_ingest.utils import AssociatedFiles - +from conftest import get_raw_data_files FIRST_NAME = "Given Name" LAST_NAME = "FamilyName" @@ -40,30 +37,33 @@ @pytest.fixture def imaging_session() -> ImagingSession: - PatientName = f"{FIRST_NAME}^{LAST_NAME}" default_dicom_dir dicoms = [ DicomSeries(d.iterdir()) for d in ( get_pet_image( out_dir=default_dicom_dir(pet_src_file).with_suffix(".with-spaces"), - PatientName=PatientName, + first_name=FIRST_NAME, + last_name=LAST_NAME, ), get_ac_image( out_dir=default_dicom_dir(ac_src_file).with_suffix(".with-spaces"), - PatientName=PatientName, + first_name=FIRST_NAME, + last_name=LAST_NAME, ), get_topogram_image( out_dir=default_dicom_dir(topogram_src_file).with_suffix( ".with-spaces" ), - PatientName=PatientName, + first_name=FIRST_NAME, + last_name=LAST_NAME, ), get_statistics_image( out_dir=default_dicom_dir(statistics_src_file).with_suffix( ".with-spaces" ), - PatientName=PatientName, + first_name=FIRST_NAME, + last_name=LAST_NAME, ), ) ] @@ -120,11 +120,11 @@ def dataset(tmp_path: Path) -> FrameSet: "medimage/vnd.siemens.biograph128-vision.vr20b.pet-list-mode", ".*/LISTMODE", ), - ( - "sinogram", - "medimage/vnd.siemens.biograph128-vision.vr20b.pet-sinogram", - ".*/EM_SINO", - ), + # ( + # "sinogram", + # "medimage/vnd.siemens.biograph128-vision.vr20b.pet-sinogram", + # ".*/EM_SINO", + # ), ( "countrate", "medimage/vnd.siemens.biograph128-vision.vr20b.pet-count-rate", @@ -135,9 +135,9 @@ def dataset(tmp_path: Path) -> FrameSet: return dataset -@pytest.mark.xfail( - condition=platform.system() == "Linux", reason="Not working on ubuntu" -) +# @pytest.mark.xfail( +# condition=platform.system() == "Linux", reason="Not working on ubuntu" +# ) def test_session_select_resources( imaging_session: ImagingSession, dataset: FrameSet, tmp_path: Path ): @@ -145,44 +145,46 @@ def test_session_select_resources( assoc_dir = tmp_path / "assoc" assoc_dir.mkdir() - for fspath in get_raw_data_files( - first_name=FIRST_NAME.replace(" ", "_"), last_name=LAST_NAME - ): - fspath.rename(assoc_dir / fspath.name) + get_raw_data_files( + out_dir=assoc_dir, first_name=FIRST_NAME.replace(" ", "_"), last_name=LAST_NAME + ) staging_dir = tmp_path / "staging" staging_dir.mkdir() staged_session = imaging_session.stage( staging_dir, - associated_files=AssociatedFiles( - str(assoc_dir) + "/{PatientName.given_name}_{PatientName.family_name}*.ptd", - r".*/[^\.]+.[^\.]+.[^\.]+.(?P\d+)\.[A-Z]+_(?P[^\.]+).*", - ), + associated_file_groups=[ + AssociatedFiles( + str(assoc_dir) + + "/{PatientName.family_name}_{PatientName.given_name}*.ptd", + r".*/[^\.]+.[^\.]+.[^\.]+.(?P\d+)\.[A-Z]+_(?P[^\.]+).*", + ) + ], spaces_to_underscores=True, ) resources = list(staged_session.select_resources(dataset)) - assert len(resources) == 6 + assert len(resources) == 5 # 6 ids, descs, resource_names, scans = zip(*resources) - assert set(ids) == set(("1", "2", "4", "602", "603")) + assert set(ids) == set(("1", "2", "4", "602")) # , "603")) assert set(descs) == set( [ "AC CT 30 SWB HD_FoV", "PET SWB 8MIN", "Topogram 06 Tr60", "602", - "603", + # "603", ] ) - assert set(resource_names) == set(("DICOM", "LISTMODE", "COUNTRATE", "EM_SINO")) + assert set(resource_names) == set(("DICOM", "LISTMODE", "COUNTRATE")) # , "EM_SINO" assert set(type(s) for s in scans) == set( [ DicomSeries, Vnd_Siemens_Biograph128Vision_Vr20b_PetListMode, Vnd_Siemens_Biograph128Vision_Vr20b_PetCountRate, - Vnd_Siemens_Biograph128Vision_Vr20b_PetSinogram, + # Vnd_Siemens_Biograph128Vision_Vr20b_PetSinogram, ] ) diff --git a/xnat_ingest/utils.py b/xnat_ingest/utils.py index a4bc749..c7d3009 100644 --- a/xnat_ingest/utils.py +++ b/xnat_ingest/utils.py @@ -8,13 +8,21 @@ import hashlib import attrs import click.types -from fileformats.core import FileSet +from fileformats.core import DataType, FileSet, from_mime from .dicom import DicomField # noqa logger = logging.getLogger("xnat-ingest") +def datatype_converter( + datatype_str: ty.Union[str, ty.Type[DataType]] +) -> ty.Type[DataType]: + if isinstance(datatype_str, str): + return from_mime(datatype_str) + return datatype_str + + class classproperty(object): def __init__(self, f): self.f = f @@ -110,8 +118,9 @@ class MailServer(CliTyped): @attrs.define class AssociatedFiles(CliTyped): - glob: str - identity_pattern: str + datatype: ty.Type[FileSet] = attrs.field(converter=datatype_converter) + glob: str = attrs.field() + identity_pattern: str = attrs.field() @attrs.define @@ -136,6 +145,7 @@ def set_logger_handling( mail_server: MailServer, add_logger: ty.Sequence[str] = (), ): + loggers = [logger] for log in add_logger: loggers.append(logging.getLogger(log)) @@ -173,16 +183,19 @@ def set_logger_handling( logr.addHandler(smtp_hdle) # Configure the file logger - for log_file in log_files: - log_file.path.parent.mkdir(exist_ok=True) - log_file_hdle = logging.FileHandler(log_file) - if log_file.loglevel: - log_file_hdle.setLevel(getattr(logging, log_file.loglevel.upper())) - log_file_hdle.setFormatter( - logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - ) - for logr in loggers: - logr.addHandler(log_file_hdle) + if log_files: + for log_file in log_files: + log_file.path.parent.mkdir(exist_ok=True) + log_file_hdle = logging.FileHandler(log_file) + if log_file.loglevel: + log_file_hdle.setLevel(getattr(logging, log_file.loglevel.upper())) + log_file_hdle.setFormatter( + logging.Formatter( + "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + ) + ) + for logr in loggers: + logr.addHandler(log_file_hdle) console_hdle = logging.StreamHandler(sys.stdout) console_hdle.setLevel(getattr(logging, log_level.upper())) @@ -397,7 +410,8 @@ def replace_named_groups(match): stripped_fspath = Path(part) else: stripped_fspath /= part - new_fspath = stripped_fspath + assert stripped_fspath is not None + new_fspath = str(stripped_fspath) # Use re.sub() with the custom replacement function transformed.append(Path(new_fspath)) return transformed From 39b0c15535198d5d6079dc3dbbab57721aa1b2e9 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 24 Sep 2024 17:45:54 +1000 Subject: [PATCH 6/7] fixing up unittests --- xnat_ingest/session.py | 4 +++- xnat_ingest/tests/test_dicom.py | 2 +- xnat_ingest/tests/test_session.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/xnat_ingest/session.py b/xnat_ingest/session.py index e5ab40a..118a561 100644 --- a/xnat_ingest/session.py +++ b/xnat_ingest/session.py @@ -834,7 +834,9 @@ def stage( else: shutil.copyfile(fspath, dest_path) resource_fspaths.append(dest_path) - scan_resources[resource_name] = associated_files.datatype(resource_fspaths) + scan_resources[resource_name] = associated_files.datatype( + resource_fspaths + ) staged_scans.append( ImagingScan( id=scan_id, diff --git a/xnat_ingest/tests/test_dicom.py b/xnat_ingest/tests/test_dicom.py index 703b088..69a4f72 100644 --- a/xnat_ingest/tests/test_dicom.py +++ b/xnat_ingest/tests/test_dicom.py @@ -29,7 +29,7 @@ def test_mrtrix_dicom_metadata(dicom_series: DicomSeries): ] dicom_series = DicomSeries(dicom_series, specific_tags=keys) - assert sorted(dicom_series.metadata) == sorted(keys + ["SpecificCharacterSet"]) + assert not (set(keys + ["SpecificCharacterSet"]) - set(dicom_series.metadata)) assert dicom_series.metadata["PatientName"] == "GivenName^FamilyName" assert dicom_series.metadata["AccessionNumber"] == "987654321" assert dicom_series.metadata["PatientID"] == "Session Label" diff --git a/xnat_ingest/tests/test_session.py b/xnat_ingest/tests/test_session.py index 6f5a9e2..6d1f9ba 100644 --- a/xnat_ingest/tests/test_session.py +++ b/xnat_ingest/tests/test_session.py @@ -4,6 +4,7 @@ from fileformats.core import from_mime, FileSet from fileformats.medimage import ( DicomSeries, + Vnd_Siemens_Biograph128Vision_Vr20b_PetRawData, Vnd_Siemens_Biograph128Vision_Vr20b_PetCountRate, Vnd_Siemens_Biograph128Vision_Vr20b_PetListMode, Vnd_Siemens_Biograph128Vision_Vr20b_PetSinogram, @@ -156,6 +157,7 @@ def test_session_select_resources( staging_dir, associated_file_groups=[ AssociatedFiles( + Vnd_Siemens_Biograph128Vision_Vr20b_PetRawData, str(assoc_dir) + "/{PatientName.family_name}_{PatientName.given_name}*.ptd", r".*/[^\.]+.[^\.]+.[^\.]+.(?P\d+)\.[A-Z]+_(?P[^\.]+).*", From f1bea511109dfd724679b991aa3afd6e512c34af Mon Sep 17 00:00:00 2001 From: Tom Close Date: Tue, 24 Sep 2024 18:08:15 +1000 Subject: [PATCH 7/7] deleted impotent line --- xnat_ingest/tests/test_session.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/xnat_ingest/tests/test_session.py b/xnat_ingest/tests/test_session.py index 6d1f9ba..8bb285a 100644 --- a/xnat_ingest/tests/test_session.py +++ b/xnat_ingest/tests/test_session.py @@ -1,13 +1,11 @@ from pathlib import Path import pytest -import platform from fileformats.core import from_mime, FileSet from fileformats.medimage import ( DicomSeries, Vnd_Siemens_Biograph128Vision_Vr20b_PetRawData, Vnd_Siemens_Biograph128Vision_Vr20b_PetCountRate, Vnd_Siemens_Biograph128Vision_Vr20b_PetListMode, - Vnd_Siemens_Biograph128Vision_Vr20b_PetSinogram, ) from frametree.core.frameset import FrameSet # type: ignore[import-untyped] from frametree.common import FileSystem # type: ignore[import-untyped] @@ -38,7 +36,6 @@ @pytest.fixture def imaging_session() -> ImagingSession: - default_dicom_dir dicoms = [ DicomSeries(d.iterdir()) for d in (