diff --git a/.gitignore b/.gitignore index 7cd7d28..044b97b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ __pycache__ logs predict +predict.* *.h5 *.keras *.hdf5 diff --git a/dataset/borders.py b/dataset/borders.py new file mode 100644 index 0000000..109af2a --- /dev/null +++ b/dataset/borders.py @@ -0,0 +1,77 @@ +import tensorflow as tf +import time +import os +import sys +import inspect + +currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) +parentdir = os.path.dirname(currentdir) +sys.path.insert(0, parentdir) + +from tools import resize_3d +import config as config + + +DEBUG_DATALOADER = True +DEBUG_DATA_LOADING_PERFORMANCE = False + +# cutout and resize 3-d tensor with shape [w,h,d] +# 1) cut overheads off from top_left to bottom_right + 1 +# top_left and bottom_right will be present in the final shape +# 2) resize shape from step(1) to final shape +# final shape taken form the config +def cutout_and_resize_tensor(tensor, top_left, bottom_right): + # assert tensor.ndim == 3 + t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1] + t = tf.squeeze(t) + # assert tf.rank(t) == 3 + final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z]) + t = resize_3d.resize_3d_image(t, final_shape) + return t + + +# cutout and resize 3-d tensor with shape [w,h,d] +# cut overheads off from top_left to bottom_right + 1 +# percentages are used to calculate from top_left and bottom_right +# +# for example, if top_left = [0,0,0] and bottom_right = [100,100,100] +# pancreas volume from [10,10,10] to [90,90,90] +# if all percentages 0.1 (which is 10%) then +# the cut will be from [1,1,1] to [99,99,99] +def cut_and_resize_including_pancreas(data, label, top_left_percentage, bottom_right_percentage): + + start_prep = time.time() + top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0) + bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0) + # random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32)) + top_left_offset = top_left_percentage * tf.cast(top_left_label_position, dtype=tf.float32) + top_left_offset = tf.cast(top_left_offset, dtype = tf.int32) + # random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32)) + bottom_right_offset = tf.cast(tf.shape(data), dtype=tf.float32) - bottom_right_percentage * tf.cast(tf.shape(data) - tf.cast(bottom_right_label_position, dtype=tf.int32), dtype=tf.float32) + bottom_right_offset = tf.cast(bottom_right_offset, dtype = tf.int32) + finish_prep = time.time() + + if DEBUG_DATALOADER: + print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy()) + print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy()) + print(f"\toriginal shape: {label.shape}") + print(f"\tpercentages: top_left: {top_left_percentage:.2f}, bottom_right: {bottom_right_percentage:.2f}") + print("\toffset_top_left:", top_left_offset.numpy(), "bottom_right_offset:", bottom_right_offset.numpy()) + print("\tslice shape:", (bottom_right_offset - top_left_offset + 1).numpy()) + + start_data = time.time() + _data = cutout_and_resize_tensor(data, top_left_offset, bottom_right_offset) + finish_data = time.time() + + start_label = time.time() + _label = cutout_and_resize_tensor(label, top_left_offset, bottom_right_offset) + finish_label = time.time() + + if DEBUG_DATA_LOADING_PERFORMANCE: + print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}") + + if DEBUG_DATALOADER: + print("\t_data shape:", _data.shape, "_label shape:", _label.shape) + + return _data, _label + diff --git a/dataset/craft_datasets.py b/dataset/craft_datasets.py index 443e230..4685c57 100644 --- a/dataset/craft_datasets.py +++ b/dataset/craft_datasets.py @@ -2,6 +2,7 @@ import time import tensorflow as tf import numpy as np +import borders import os import sys import inspect @@ -15,7 +16,7 @@ DEBUG_DATALOADER = False -DEBUG_DATA_LOADING_PERFORMANCE = False +DEBUG_DATA_LOADING_PERFORMANCE = True def fname_to_patientid(fname_src:str): if DEBUG_DATALOADER: @@ -47,54 +48,6 @@ def read_data_and_label(patient_id:str, src_folder:str): return data_array, label_array -# cutout and resize 3-d tensor with shape [w,h,d] -# 1) cut overheads off from top_left to bottom_right + 1 -# top_left and bottom_right will be present in the final shape -# 2) resize shape from step(1) to final shape -# final shape taken form the config -def cutout_and_resize_tensor(tensor, top_left, bottom_right): - # assert tensor.ndim == 3 - t = tensor[top_left[0]:bottom_right[0] + 1, top_left[1]:bottom_right[1] + 1, top_left[2]:bottom_right[2] + 1] - t = tf.squeeze(t) - # assert tf.rank(t) == 3 - final_shape = tf.constant([config.IMAGE_DIMENSION_X, config.IMAGE_DIMENSION_Y, config.IMAGE_DIMENSION_Z]) - t = resize_3d.resize_3d_image(t, final_shape) - return t - - -def random_slice_including_pancreas(data, label): - - start_prep = time.time() - top_left_label_position = tf.reduce_min(tf.where(label == 1), axis=0) - bottom_right_label_position = tf.reduce_max(tf.where(label == 1), axis=0) - random_offset_top_left = tf.random.uniform(shape = [3], minval = [0.0, 0.0, 0.0], maxval = tf.cast(top_left_label_position, dtype=tf.float32)) - random_offset_top_left = tf.cast(random_offset_top_left, dtype = tf.int32) - random_offset_bottom_right = tf.random.uniform(shape = [3], minval = tf.cast(bottom_right_label_position, dtype=tf.float32), maxval = tf.cast(tf.shape(data), dtype=tf.float32)) - random_offset_bottom_right = tf.cast(random_offset_bottom_right, dtype = tf.int32) - finish_prep = time.time() - - if DEBUG_DATALOADER: - print("\ttop_left_label_position:", top_left_label_position.numpy(), "bottom_right_label_position:", bottom_right_label_position.numpy()) - print("\tpancreas shape:", (bottom_right_label_position - top_left_label_position).numpy()) - print("\trandom_offset_top_left:", random_offset_top_left.numpy(), "random_offset_bottom_right:", random_offset_bottom_right.numpy()) - print("\tslice shape:", (random_offset_bottom_right - random_offset_top_left + 1).numpy()) - - start_data = time.time() - _data = cutout_and_resize_tensor(data, random_offset_top_left, random_offset_bottom_right) - finish_data = time.time() - - start_label = time.time() - _label = cutout_and_resize_tensor(label, random_offset_top_left, random_offset_bottom_right) - finish_label = time.time() - - if DEBUG_DATA_LOADING_PERFORMANCE: - print(f"\tDATA_LOADING_PERFORMANCE: prep: {finish_prep - start_prep:.1f} data: {finish_data - start_data:.1f} label: {finish_label - start_label:.1f}") - - if DEBUG_DATALOADER: - print("\t_data shape:", _data.shape, "_label shape:", _label.shape) - - return _data, _label - class FileIterator: def __init__(self, folder): self.folder = folder @@ -127,7 +80,7 @@ def __call__(self): finish_reading = time.time() start_resize = time.time() - data, label = random_slice_including_pancreas(data, label) + # data, label = borders.cut_and_resize_including_pancreas(data, label, np.random.rand(), np.random.rand()) finish_resize = time.time() if DEBUG_DATA_LOADING_PERFORMANCE: @@ -229,13 +182,13 @@ def __next__(self): def __run_through_data_wo_any_action(ds_train, ds_valid): for epoch in range(2): for i, (t, (data, label)) in enumerate(MeasureTime(ds_train)): - print(f"train, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\t data shape: {data.shape}\tmean/std: {tf.reduce_mean(tf.cast( data, dtype=tf.float32)).numpy():.3f}/{tf.math.reduce_std(tf.cast( data, dtype=tf.float32)).numpy():.3f}") - print(f"train, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\tlabel shape: {label.shape}\tmean/std: {tf.reduce_mean(tf.cast(label, dtype=tf.float32)).numpy():.3f}/{tf.math.reduce_std(tf.cast(label, dtype=tf.float32)).numpy():.3f}") + print(f"train, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\t data shape: {data.shape}\tmean/std: {tf.reduce_mean(tf.cast( data, dtype=tf.float32)).numpy():.2f}/{tf.math.reduce_std(tf.cast( data, dtype=tf.float32)).numpy():.2f}") + print(f"train, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\tlabel shape: {label.shape}\tmean/std/sum: {tf.reduce_mean(tf.cast(label, dtype=tf.float32)).numpy():.2f}/{tf.math.reduce_std(tf.cast(label, dtype=tf.float32)).numpy():.2f}/{tf.math.reduce_sum(tf.cast(label, dtype=tf.float32)).numpy():.0f}") print("Valid ds:") for i, (t, (data, label)) in enumerate(MeasureTime(ds_valid)): - print(f"valid, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\t data shape: {data.shape}\tmean/std: {tf.reduce_mean(tf.cast( data, dtype=tf.float32)).numpy():.3f}/{tf.math.reduce_std(tf.cast( data, dtype=tf.float32)).numpy():.3f}") - print(f"valid, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\tlabel shape: {label.shape}\tmean/std: {tf.reduce_mean(tf.cast(label, dtype=tf.float32)).numpy():.3f}/{tf.math.reduce_std(tf.cast(label, dtype=tf.float32)).numpy():.3f}") + print(f"valid, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\t data shape: {data.shape}\tmean/std: {tf.reduce_mean(tf.cast( data, dtype=tf.float32)).numpy():.2f}/{tf.math.reduce_std(tf.cast( data, dtype=tf.float32)).numpy():.2f}") + print(f"valid, epoch/batch {epoch}/{i:02d},\tlatency {t:.1f}\tlabel shape: {label.shape}\tmean/std: {tf.reduce_mean(tf.cast(label, dtype=tf.float32)).numpy():.2f}/{tf.math.reduce_std(tf.cast(label, dtype=tf.float32)).numpy():.2f}") diff --git a/dataset/pomc_dataset.py b/dataset/pomc_dataset.py index 067869a..b5f790c 100644 --- a/dataset/pomc_dataset.py +++ b/dataset/pomc_dataset.py @@ -8,6 +8,7 @@ import numpy as np import pydicom import nrrd +import borders currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) parentdir = os.path.dirname(currentdir) @@ -26,6 +27,15 @@ DEBUG = True +# Stored Values (SV) are the values stored in the image pixel data attribute. +# Representation value should be calculated as: +# Rescaled value = SV * Rescale Slope + Rescale Intercept +# https://dicom.innolitics.com/ciods/digital-x-ray-image/dx-image/00281052 +def dcim_slice_stored_value_to_rescaled_value(slice): + rescale_intercept = slice.RescaleIntercept if hasattr(slice, "RescaleIntercept") else 0 + rescale_slope = slice.RescaleSlope if hasattr(slice, "RescaleSlope") else 1 + return slice.pixel_array * rescale_slope + rescale_intercept + class POMCDataset: def __init__(self, patients_src_folder, labels_src_folder, TFRECORD_FOLDER): self.patients_src_folder = patients_src_folder @@ -60,7 +70,8 @@ def read_dicom_data_from_files(files): ] if len(slices): - result = np.stack([_.pixel_array for _ in slices], axis = -1) + result = np.stack([dcim_slice_stored_value_to_rescaled_value(_) for _ in slices], axis = -1) + # result = np.stack([_.pixel_array for _ in slices], axis = -1) else: print("ERROR: can't dcmread from files:", files) @@ -146,6 +157,10 @@ def consistency_check(self, data, label, data_metadata, label_metadata): print("ERROR: label space origin(", label_metadata["space origin"], ") is not close to the data first slice origin(", data_metadata["min"], ")") return False + if np.mean(data) > 1000: + print("ERROR: data mean(", np.mean(data), ") is too high. Probably probleam with reading DCIM data") + return False + ############################## # Thos is not relevant check # ############################## @@ -161,6 +176,7 @@ def print_statistic(self, tensor_old, tensor_new): print("\tmin/max:\t{}/{} -> {}/{}".format(np.min(tensor_old), np.max(tensor_old), np.min(tensor_new), np.max(tensor_new))) print("\tmean std:\t{:.3f} {:.3f} -> {:.3f} {:.3f}".format(np.mean(tensor_old), np.std(tensor_old), np.mean(tensor_new), np.std(tensor_new))) + print("\tsum:\t\t{} -> {}".format(np.sum(tensor_old), np.sum(tensor_new))) def preprocess_data(self, data, label): # zoom = AUGMENT_SCALED_DIMS / data.shape @@ -244,15 +260,15 @@ def sanity_check_after_preprocessing(self, data, label): return result - def save_npy(self, subfolder: str, patient_id:str, original_data, original_label, scaled_data, scaled_label): + def save_npy(self, subfolder: str, patient_id:str, percentage: int, original_data, original_label, scaled_data, scaled_label): result = True scaled_data = np.cast[np.float32](scaled_data) scaled_label = np.cast[np.int8](scaled_label) - np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + ".npz"), [scaled_data, scaled_label]) + np.savez_compressed(os.path.join(self.TFRECORD_FOLDER, subfolder, patient_id + f"_{percentage}.npz", ), [scaled_data, scaled_label]) return result - def pickle_src_data(self, ratio=0.2): + def pickle_src_data(self, train_valid_percentage=0.2): if not os.path.exists(self.TFRECORD_FOLDER): print("ERROR: can't find TFRecord folder:", self.TFRECORD_FOLDER) return @@ -264,7 +280,7 @@ def pickle_src_data(self, ratio=0.2): folder_list = glob.glob(os.path.join(self.patients_src_folder, "*")) for folder in folder_list: - subfolder = "train" if np.random.rand() > ratio else "valid" + subfolder = "train" if np.random.rand() > train_valid_percentage else "valid" patient_id = self.get_patient_id_from_folder(folder) @@ -293,24 +309,29 @@ def pickle_src_data(self, ratio=0.2): print("ERROR: data & labels are not consistent patient_id:", patient_id) continue - start_ts = time.time() - scaled_src_data, scaled_label_data = self.preprocess_data(src_data, label_data) - print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts)) + for percentage in [0, 30, 60, 90]: + print(f"\n\tPreprocess data for {percentage}%") + scaled_data, scaled_label = borders.cut_and_resize_including_pancreas(src_data, label_data, percentage/100, percentage/100) - if DEBUG: - print("\tData") - self.print_statistic(src_data, scaled_src_data) - print("\tLabel") - self.print_statistic(label_data, scaled_label_data) + start_ts = time.time() + scaled_src_data, scaled_label_data = self.preprocess_data(scaled_data.numpy(), scaled_label.numpy()) + print("\tPreprocess data in {:.2f} sec".format(time.time() - start_ts)) - if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False: - print("ERROR: data or label failed sanity check") - continue + if DEBUG: + print("\tData") + self.print_statistic(src_data, scaled_src_data) + print("\tLabel") + self.print_statistic(label_data, scaled_label_data) + + + if self.sanity_check_after_preprocessing(scaled_src_data, scaled_label_data) == False: + print("ERROR: data or label failed sanity check") + continue - if self.save_npy(subfolder, patient_id, src_data, label_data, scaled_src_data, scaled_label_data) == False: - print("ERROR: can't save TFRecord patient id:", patient_id) + if self.save_npy(subfolder, patient_id, percentage, src_data, label_data, scaled_src_data, scaled_label_data) == False: + print("ERROR: can't save TFRecord patient id:", patient_id) def main(): diff --git a/predict.py b/predict.py index 0c65b7c..5573674 100644 --- a/predict.py +++ b/predict.py @@ -47,7 +47,7 @@ def __preprocess_data(self, data): return data - def __create_mask(self, data): + def __create_segmentation(self, data): mask = tf.argmax(data, axis = -1) mask = mask[..., tf.newaxis] return mask @@ -103,7 +103,7 @@ def __get_affine_matrix(self, dcm_slices): return affine - def __resize_mask_to_dcm_shape(self, mask, dcm_slices): + def __resize_segmentation_to_dcm_shape(self, mask, dcm_slices): dcm_rows = dcm_slices[0][0x0028, 0x0010].value dcm_columns = dcm_slices[0][0x0028, 0x0011].value dcm_depth = len(dcm_slices) @@ -135,9 +135,9 @@ def main(self, dcm_folder, result_file_name): # model.summary() - prediction = model(src_data) - mask = self.__create_mask(prediction) - mask = self.__resize_mask_to_dcm_shape(mask, dcm_slices) + prediction = model.predict(src_data) + mask = self.__create_segmentation(prediction) + mask = self.__resize_segmentation_to_dcm_shape(mask, dcm_slices) mask = tf.squeeze(mask) affine_matrix = self.__get_affine_matrix(dcm_slices)