Merge pull request #22 from Australian-Imaging-Service/datatype-handling
Debugged datatype, logging and missing ID handling
tclose authored Oct 2, 2024
2 parents 68ddb1a + 9bd2c84 commit e0d753a
Showing 7 changed files with 100 additions and 63 deletions.
50 changes: 25 additions & 25 deletions .pre-commit-config.yaml
@@ -1,28 +1,28 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
exclude: ^(arcana/_version\.py|versioneer\.py)$
args:
- -l 88
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: ^(xnat_checks/_version\.py|versioneer\.py)$
args:
- --ignore-words=.codespell-ignorewords
- repo: https://github.com/PyCQA/flake8
rev: 4.0.1
hooks:
- id: flake8
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
exclude: ^(arcana/_version\.py|versioneer\.py)$
args:
- -l 88
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: ^(xnat_checks/_version\.py|versioneer\.py)$
args:
- --ignore-words=.codespell-ignorewords
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
23 changes: 23 additions & 0 deletions scripts/get_pet_tst.py
@@ -0,0 +1,23 @@
import tempfile
from pathlib import Path

from fileformats.medimage import DicomSeries
from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import (  # type: ignore[import-untyped]
    get_image as get_pet_image,
)

# Quick manual check that DicomSeries can read metadata from a dummy
# Siemens Biograph Vision whole-body PET series
tmp_path = Path(tempfile.mkdtemp())

# Generate the dummy DICOM files in the temporary directory and wrap them
# in a DicomSeries fileset
series = DicomSeries(
    get_pet_image(
        tmp_path,
        first_name="first",
        last_name="last",
        StudyInstanceUID="StudyInstanceUID",
        PatientID="PatientID",
        AccessionNumber="AccessionNumber",
        StudyID="xnat_project",
    ).iterdir()
)

# Should print the StudyID passed above, i.e. "xnat_project"
print(series.metadata["StudyID"])
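A small aside (not part of the committed script): assuming fileformats' MIME-like names round-trip back to their classes, the 'medimage/dicom-series' default used by the stage command below resolves to this same DicomSeries class:

from fileformats.core import from_mime
from fileformats.medimage import DicomSeries

# Assumes the registered MIME-like name maps back to the class
assert from_mime("medimage/dicom-series") is DicomSeries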
35 changes: 24 additions & 11 deletions xnat_ingest/cli/stage.py
@@ -7,6 +7,7 @@
import tempfile
from tqdm import tqdm
from fileformats.core import FileSet
from fileformats.medimage import DicomSeries
from xnat_ingest.cli.base import cli
from xnat_ingest.session import ImagingSession
from frametree.xnat import Xnat # type: ignore[import-untyped]
@@ -25,13 +26,13 @@


@cli.command(
help="""Stages DICOM and associated files found in the input directories into separate
directories for each session
help="""Stages images found in the input directories into separate directories for each
imaging acquisition session
DICOMS_PATH is either the path to a directory containing the DICOM files to upload, or
a glob pattern that selects the DICOM paths directly
FILES_PATH is either the path to a directory containing the files to upload, or
a glob pattern that selects the paths directly
STAGING_DIR is the directory that the files for each session are collated to before they
OUTPUT_DIR is the directory that the files for each session are collated to before they
are uploaded to XNAT
""",
)
@@ -42,9 +43,15 @@
type=str,
metavar="<mime-type>",
multiple=True,
default=["medimage/dicom-series"],
envvar="XINGEST_DATATYPE",
help="The datatype of the primary files to to upload",
default=None,
envvar="XINGEST_DATATYPES",
help=(
'The MIME-type(s) (or "MIME-like" strings, see the FileFormats docs) of the potential '
"datatype(s) of the primary files to upload; defaults to 'medimage/dicom-series'. "
"Any format implemented in the FileFormats Python package "
"(https://github.com/ArcanaFramework/fileformats) that implements the 'read_metadata' "
'"extra" is supported; see the FileFormats docs on how to add support for new formats.'
),
)
@click.option(
"--project-field",
@@ -250,7 +257,7 @@
def stage(
files_path: str,
output_dir: Path,
datatype: str,
datatype: list[str] | None,
associated_files: ty.List[AssociatedFiles],
project_field: str,
subject_field: str,
@@ -279,6 +286,11 @@ def stage(
logger_configs=loggers,
additional_loggers=additional_loggers,
)
datatypes: list[ty.Type[FileSet]]
if not datatype:
datatypes = [DicomSeries]
else:
datatypes = [FileSet.from_mime(dt) for dt in datatype] # type: ignore[misc]

if xnat_login:
xnat_repo = Xnat(
@@ -292,10 +304,10 @@ def stage(
else:
project_list = None

if session_field is None and datatype == "medimage/dicom-series":
if session_field is None and DicomSeries in datatypes:
session_field = "StudyInstanceUID"

msg = f"Loading {datatype} sessions from '{files_path}'"
msg = f"Loading {list(datatypes)} sessions from '{files_path}'"

for assoc_files in associated_files:
msg += f" with associated files selected from '{assoc_files.glob}'"
@@ -319,6 +331,7 @@
def do_stage() -> None:
sessions = ImagingSession.from_paths(
files_path=files_path,
datatypes=datatypes,
project_field=project_field,
subject_field=subject_field,
visit_field=visit_field,
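For readers skimming the hunks above, the datatype handling added to this command boils down to the following sketch (a restatement of the diff, not a new helper in the codebase):

import typing as ty

from fileformats.core import FileSet
from fileformats.medimage import DicomSeries


def resolve_datatypes(datatype: list[str] | None) -> list[ty.Type[FileSet]]:
    # No --datatype given: fall back to DICOM series
    if not datatype:
        return [DicomSeries]
    # Otherwise resolve each MIME-like string to its fileformats class
    return [FileSet.from_mime(dt) for dt in datatype]  # type: ignore[misc]


datatypes = resolve_datatypes(None)
# DICOM inputs default the session identifier to the StudyInstanceUID tag
session_field = "StudyInstanceUID" if DicomSeries in datatypes else None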
4 changes: 3 additions & 1 deletion xnat_ingest/resource.py
@@ -27,7 +27,9 @@ class ImagingResource:

@checksums.default
def calculate_checksums(self) -> dict[str, str]:
return self.fileset.hash_files(crypto=hashlib.md5)
return self.fileset.hash_files(
crypto=hashlib.md5, relative_to=self.fileset.parent
)

@property
def datatype(self) -> ty.Type[FileSet]:
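As I read the relative_to change above, checksums are now keyed by paths relative to the fileset's parent rather than by absolute paths, so the same resource staged in two different temporary directories produces identical checksum dictionaries. A minimal sketch of that idea, not the fileformats implementation:

import hashlib
from pathlib import Path


def md5_checksums(files: list[Path], root: Path) -> dict[str, str]:
    # Key each digest by the path relative to `root`, so the mapping is
    # independent of where the resource is staged on disk
    return {
        str(path.relative_to(root)): hashlib.md5(path.read_bytes()).hexdigest()
        for path in sorted(files)
    }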
33 changes: 21 additions & 12 deletions xnat_ingest/session.py
@@ -323,6 +323,7 @@ def from_paths(
multiple_sessions: ty.DefaultDict[str, ty.Set[ty.Tuple[str, str, str]]] = (
defaultdict(set)
)
missing_ids: dict[str, dict[str, str]] = defaultdict(dict)
for resource in tqdm(
resources,
"Sorting resources into XNAT tree structure...",
@@ -338,21 +339,28 @@ def get_id(field_type: str, field_name: str) -> str:
try:
value = resource.metadata[field_name]
except KeyError:
value = ""
if not value:
if session_uid and field_type in ("project", "subject", "visit"):
value = (
"INVALID_MISSING_"
+ field_type.upper()
+ "_"
+ "".join(
random.choices(
string.ascii_letters + string.digits, k=8
try:
value = missing_ids[session_uid][field_type]
except KeyError:
value = missing_ids[session_uid][field_type] = (
"INVALID_MISSING_"
+ field_type.upper()
+ "_"
+ "".join(
random.choices(
string.ascii_letters + string.digits, k=8
)
)
)
else:
raise ImagingSessionParseError(
f"Did not find '{field_name}' field in {resource!r}, "
"cannot uniquely identify the resource, found:\n"
+ "\n".join(resource.metadata)
)
raise ImagingSessionParseError(
f"Did not find '{field_name}' field in {resource}, "
"cannot uniquely identify the resource"
)
if index is not None:
value = value[index]
value_str = str(value)
@@ -399,7 +407,8 @@ def get_id(field_type: str, field_name: str) -> str:
raise ImagingSessionParseError(
"Multiple session UIDs found with the same project/subject/visit ID triplets: "
+ "\n".join(
f"{i} -> {p}:{s}:{v}" for i, (p, s, v) in multiple_sessions.items()
f"{i} -> " + str(["{p}:{s}:{v}" for p, s, v in sess])
for i, sess in multiple_sessions.items()
)
)
return list(sessions.values())
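Pulled out of the hunks above for clarity, the missing-ID fix caches one placeholder per session UID and field type, so every resource in an affected session receives the same generated ID rather than a fresh random one per resource (a standalone restatement of the diff, not a separate helper in the package):

import random
import string
from collections import defaultdict

# One placeholder per (session UID, field type)
missing_ids: dict[str, dict[str, str]] = defaultdict(dict)


def placeholder_id(session_uid: str, field_type: str) -> str:
    try:
        return missing_ids[session_uid][field_type]
    except KeyError:
        suffix = "".join(random.choices(string.ascii_letters + string.digits, k=8))
        value = missing_ids[session_uid][field_type] = (
            "INVALID_MISSING_" + field_type.upper() + "_" + suffix
        )
        return value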
15 changes: 1 addition & 14 deletions xnat_ingest/tests/test_session.py
@@ -1,7 +1,7 @@
from pathlib import Path
import pytest
import typing as ty
from fileformats.core import from_mime, FileSet
from fileformats.core import from_mime
from fileformats.medimage import (
DicomSeries,
Vnd_Siemens_Biograph128Vision_Vr20b_PetRawData,
@@ -10,22 +10,17 @@
)
from frametree.core.frameset import FrameSet # type: ignore[import-untyped]
from frametree.common import FileSystem # type: ignore[import-untyped]
from medimages4tests.dummy.dicom.base import default_dicom_dir # type: ignore[import-untyped]
from medimages4tests.dummy.dicom.pet.wholebody.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped]
get_image as get_pet_image,
__file__ as pet_src_file,
)
from medimages4tests.dummy.dicom.ct.ac.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped]
get_image as get_ac_image,
__file__ as ac_src_file,
)
from medimages4tests.dummy.dicom.pet.topogram.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped]
get_image as get_topogram_image,
__file__ as topogram_src_file,
)
from medimages4tests.dummy.dicom.pet.statistics.siemens.biograph_vision.vr20b import ( # type: ignore[import-untyped]
get_image as get_statistics_image,
__file__ as statistics_src_file,
)
from xnat_ingest.session import ImagingSession, ImagingScan
from xnat_ingest.store import DummyAxes
@@ -66,26 +61,18 @@ def imaging_session() -> ImagingSession:
DicomSeries(d.iterdir())
for d in (
get_pet_image(
out_dir=default_dicom_dir(pet_src_file).with_suffix(".with-spaces"),
first_name=FIRST_NAME,
last_name=LAST_NAME,
),
get_ac_image(
out_dir=default_dicom_dir(ac_src_file).with_suffix(".with-spaces"),
first_name=FIRST_NAME,
last_name=LAST_NAME,
),
get_topogram_image(
out_dir=default_dicom_dir(topogram_src_file).with_suffix(
".with-spaces"
),
first_name=FIRST_NAME,
last_name=LAST_NAME,
),
get_statistics_image(
out_dir=default_dicom_dir(statistics_src_file).with_suffix(
".with-spaces"
),
first_name=FIRST_NAME,
last_name=LAST_NAME,
),
3 changes: 3 additions & 0 deletions xnat_ingest/utils.py
@@ -126,6 +126,9 @@ def set_logger_handling(
) -> None:
"""Set up logging for the application"""

if not logger_configs:
logger_configs = [LoggerConfig("stream", "info", "stdout")]

loggers = [logger]
for log in additional_loggers:
loggers.append(logging.getLogger(log))
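The new guard gives a sensible default when no logger configuration is supplied. In plain logging terms this is presumably equivalent to something like the sketch below; the logger name and the exact semantics of LoggerConfig("stream", "info", "stdout") are assumptions about xnat_ingest internals:

import logging
import sys

# Assumed rough equivalent of the default LoggerConfig fallback
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(logging.Formatter("%(levelname)s - %(message)s"))
logger = logging.getLogger("xnat_ingest")
logger.setLevel(logging.INFO)
logger.addHandler(handler)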
