Skip to content

Commit

Permalink
moved associated files identification into stage tool instead of upload
Browse files Browse the repository at this point in the history
  • Loading branch information
tclose committed Nov 20, 2023
1 parent 0027b26 commit 42ee199
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 20 deletions.
25 changes: 19 additions & 6 deletions xnat_ingest/cli/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,20 +57,31 @@
help=("Override the project ID read from the DICOM headers"),
)
@click.option(
"--assoc-files-glob",
"--associated",
type=str,
default=None,
envvar="XNAT_INGEST_NONDICOMSPATTERN",
help=(
"Glob pattern by which to detect non-DICOM files that "
"corresponding to DICOM sessions. Can contain string templates corresponding to "
"Glob pattern by which to detect associated files to be attached to the DICOM "
"sessions. Can contain string templates corresponding to "
"DICOM metadata fields, which are substituted before the glob is called. For "
'example, "/path/to/non-dicoms/{PatientName.given_name}_{PatientName.family_name}/*)" '
"will find all files under the subdirectory within '/path/to/non-dicoms/' that matches "
"<GIVEN-NAME>_<FAMILY-NAME>. Will be interpreted as being relative to `dicoms_dir` "
"if a relative path is provided."
),
)
@click.option(
"--assoc-identification",
type=str,
default=None,
envvar="XNAT_INGEST_ASSOCIDENTIFICATION",
help=(
"Used to extract the scan ID & type/resource from the associated filename. Should "
"be a regular-expression (Python syntax) with named groups called 'id' and 'type', e.g. "
r"--assoc-id-pattern '[^\.]+\.[^\.]+\.(?P<id>\d+)\.(?P<type>\w+)\..*'"
),
)
@click.option(
"--delete/--dont-delete",
default=False,
Expand Down Expand Up @@ -129,7 +140,8 @@
def stage(
dicoms_path: str,
staging_dir: Path,
assoc_files_glob: str,
associated: str,
assoc_identification: str,
project_field: str,
subject_field: str,
session_field: str,
Expand All @@ -147,12 +159,13 @@ def stage(
logger.info(
"Loading DICOM sessions from '%s' and associated files from '%s'",
str(dicoms_path),
str(assoc_files_glob),
str(associated),
)

sessions = ImagingSession.construct(
dicoms_path=dicoms_path,
associated_files_pattern=assoc_files_glob,
associated_files_pattern=associated,
assoc_files_identification=assoc_identification,
project_field=project_field,
subject_field=subject_field,
session_field=session_field,
Expand Down
31 changes: 25 additions & 6 deletions xnat_ingest/cli/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,22 @@
),
)
@click.option(
"--include-dicoms/--exclude-dicoms",
"--all-dicoms/--not-all-dicoms",
default=False,
type=bool,
envvar="XNAT_INGEST_EXCLUDEDICOM",
envvar="XNAT_INGEST_ALLDICOMS",
help=(
"Whether to exclude DICOM scans in upload regardless of whether they are "
"Whether to include all DICOM scans in the upload regardless of whether they are "
"specified in a column or not"
),
)
@click.option(
"--all-assoc/--not-all-assoc",
default=False,
type=bool,
envvar="XNAT_INGEST_ALLASSOC",
help=(
"Whether to include all associated files in the upload regardless of whether they are "
"specified in a column or not"
),
)
Expand All @@ -114,9 +124,16 @@ def upload(
log_file: Path,
log_emails: LogEmail,
mail_server: MailServer,
include_dicoms: bool,
all_dicoms: bool,
all_assoc: bool,
raise_errors: bool,
):

if all_assoc:
raise NotImplementedError(
"--all-assoc option hasn't been implemented yet"
)

set_logger_handling(log_level, log_file, log_emails, mail_server)

xnat_repo = Xnat(
Expand Down Expand Up @@ -183,7 +200,7 @@ def upload(
raise e

# Anonymise DICOMs and save to directory prior to upload
if include_dicoms:
if all_dicoms:
logger.info(
f"Including all DICOMS in upload from '{session.name}' to "
f"{session_path} as `--include-dicoms` is set"
Expand All @@ -196,7 +213,9 @@ def upload(

for scan_id, scan_type, resource_name, scan in tqdm(
session.select_resources(
dataset, include_all_dicoms=include_dicoms
dataset,
include_all_dicoms=all_dicoms,
include_all_assoc=all_assoc,
),
f"Uploading scans found in {session.name}",
):
Expand Down
37 changes: 29 additions & 8 deletions xnat_ingest/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ def select_resources(
self,
dataset: Dataset,
include_all_dicoms: bool = False,
include_all_assoc: bool = False,
assoc_id_pattern: str = None
) -> ty.Iterator[ty.Tuple[str, str, str, FileSet]]:
"""Returns selected resources that match the columns in the dataset definition
Expand All @@ -93,7 +95,7 @@ def select_resources(
scan : FileSet
a fileset to upload
"""
store = MockDataStore(self)
store = MockDataStore(self, assoc_id_pattern=assoc_id_pattern)

uploaded: ty.Set[FileSet] = set()

Expand Down Expand Up @@ -133,7 +135,6 @@ def select_resources(
elif isinstance(scan, Dicom):
scan_id = scan["Series"]
else:
scan_id = column.name
if column.is_regex and re.compile(column.path).groups:
pattern = column.path
else:
Expand All @@ -147,7 +148,12 @@ def select_resources(
match = re.match(pattern, scan_type)
if not match:
raise RuntimeError(f"{pattern} did not match {scan_type}")
scan_type = match.group(1)
if len(match.groups()) == 1:
scan_type = match.group(1)
scan_id = column.name
else:
scan_type = match.group("type")
scan_id = match.group("id")
uploaded.add(scan)
yield scan_id, scan_type, resource_name, scan

Expand Down Expand Up @@ -179,6 +185,7 @@ def construct(
cls,
dicoms_path: str | Path,
associated_files_pattern: str | None = None,
assoc_files_identification: str | None = None,
project_field: str = "StudyID",
subject_field: str = "PatientID",
session_field: str = "AccessionNumber",
Expand All @@ -191,13 +198,17 @@ def construct(
dicoms_path : str or Path
Path to a directory containing the DICOMS to load the sessions from, or a
glob string that selects the paths
associated_files_pattern : str
associated_files_pattern : str, optional
Pattern used to select the non-dicom files to include in the session. The
pattern can contain string template placeholders corresponding to DICOM
metadata (e.g. '{PatientName.given_name}_{PatientName.family_name}'), which
are substituted before the string is used to glob the non-DICOM files. In
order to deidentify the filenames, the pattern must explicitly reference all
identifiable fields in string template placeholders.
assoc_files_identification : str, optional
Used to extract the scan ID & type/resource from the associated filename. Should
be a regular-expression (Python syntax) with named groups called 'id' and 'type', e.g.
'[^\.]+\.[^\.]+\.(?P<id>\d+)\.(?P<type>\w+)\..*'
project_field : str
the name of the DICOM field that is to be interpreted as the corresponding
XNAT project
Expand Down Expand Up @@ -227,6 +238,9 @@ def construct(
else:
dicom_fspaths = [Path(p) for p in glob(dicoms_path)]

if assoc_files_identification:
raise NotImplementedError

# Sort loaded series by StudyInstanceUID (imaging session)
logger.info("Loading DICOM series from %s", str(dicoms_path))
dicom_sessions = defaultdict(list)
Expand Down Expand Up @@ -535,6 +549,7 @@ class MockDataStore(DataStore):
"""

session: ImagingSession
assoc_id_pattern: str

@property
def row(self):
Expand Down Expand Up @@ -564,17 +579,23 @@ def populate_row(self, row: DataRow):
row : DataRow
The row to populate with entries
"""
series_numbers = []
for series_number, dcm in self.session.dicoms.items():
row.add_entry(
path=dcm["SeriesDescription"],
datatype=DicomSeries,
uri=f"dicom::{dcm['SeriesNumber']}",
uri=f"dicom::{series_number}",
)
for non_dcm_fspath in self.session.associated_file_fspaths:
series_numbers.append(series_number)

collated = defaultdict(list)
for assoc_fspath in self.session.associated_file_fspaths:

for resource in collated:
row.add_entry(
path=non_dcm_fspath.name,
path=assoc_fspath.name,
datatype=FileSet,
uri=f"associated_file::{non_dcm_fspath}",
uri=f"associated_file::{assoc_fspath}",
)

def get(self, entry: DataEntry, datatype: type) -> DataType:
Expand Down

0 comments on commit 42ee199

Please sign in to comment.