diff --git a/CHANGELOG b/CHANGELOG
index f400a65b..bc2a3531 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -6,7 +6,7 @@ All notable changes to this project will be documented in this file.
The format is based on `Keep a Changelog
[docs]def load_config(config_file_path):
- """ Parse a WORC configuration file.
+ """Parse a WORC configuration file.
Arguments:
config_file_path: path to the configuration file to be parsed.
diff --git a/WORC/doc/_build/html/_modules/WORC/IOparser/config_io_classifier.html b/WORC/doc/_build/html/_modules/WORC/IOparser/config_io_classifier.html
index 104a52ce..baeae203 100644
--- a/WORC/doc/_build/html/_modules/WORC/IOparser/config_io_classifier.html
+++ b/WORC/doc/_build/html/_modules/WORC/IOparser/config_io_classifier.html
@@ -8,7 +8,7 @@
- WORC.IOparser.config_io_classifier — WORC 3.5.0 documentation
+ WORC.IOparser.config_io_classifier — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.IOparser.config_io_classifier
#!/usr/bin/env python
-# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -210,7 +210,7 @@ Source code for WORC.IOparser.config_io_classifier
'Resampling': dict(), 'Imputation': dict(),
'Ensemble': dict(), 'Bootstrap': dict(),
'FeatPreProcess': dict(), 'Evaluation': dict(),
- 'OneHotEncoding': dict()}
+ 'OneHotEncoding': dict(), 'SMAC': dict()}
settings_dict['General']['cross_validation'] =\
settings['General'].getboolean('cross_validation')
@@ -496,6 +496,31 @@ Source code for WORC.IOparser.config_io_classifier
[float(str(item).strip()) for item in
settings['Classification']['XGB_colsample_bytree'].split(',')]
+ # Light GBM
+ settings_dict['Classification']['LightGBM_num_leaves'] =\
+ [int(str(item).strip()) for item in
+ settings['Classification']['LightGBM_num_leaves'].split(',')]
+
+ settings_dict['Classification']['LightGBM_max_depth'] =\
+ [int(str(item).strip()) for item in
+ settings['Classification']['LightGBM_max_depth'].split(',')]
+
+ settings_dict['Classification']['LightGBM_min_child_samples'] =\
+ [int(str(item).strip()) for item in
+ settings['Classification']['LightGBM_min_child_samples'].split(',')]
+
+ settings_dict['Classification']['LightGBM_reg_alpha'] =\
+ [float(str(item).strip()) for item in
+ settings['Classification']['LightGBM_reg_alpha'].split(',')]
+
+ settings_dict['Classification']['LightGBM_reg_lambda'] =\
+ [float(str(item).strip()) for item in
+ settings['Classification']['LightGBM_reg_lambda'].split(',')]
+
+ settings_dict['Classification']['LightGBM_min_child_weight'] =\
+ [int(str(item).strip()) for item in
+ settings['Classification']['LightGBM_min_child_weight'].split(',')]
+
# Cross validation settings
settings_dict['CrossValidation']['Type'] =\
str(settings['CrossValidation']['Type'])
@@ -537,9 +562,26 @@ Source code for WORC.IOparser.config_io_classifier
settings_dict['HyperOptimization']['memory'] = \
str(settings['HyperOptimization']['memory'])
+ # Settings for SMAC
+ settings_dict['SMAC']['use'] =\
+ settings['SMAC'].getboolean('use')
+ settings_dict['SMAC']['n_smac_cores'] =\
+ int(settings['SMAC']['n_smac_cores'])
+ settings_dict['SMAC']['budget_type'] =\
+ str(settings['SMAC']['budget_type'])
+ settings_dict['SMAC']['budget'] =\
+ int(settings['SMAC']['budget'])
+ settings_dict['SMAC']['init_method'] =\
+ str(settings['SMAC']['init_method'])
+ settings_dict['SMAC']['init_budget'] =\
+ int(settings['SMAC']['init_budget'])
+
# Settings for ensembling
- settings_dict['Ensemble']['Use'] =\
- settings['Ensemble'].getint('Use')
+ settings_dict['Ensemble']['Method'] =\
+ str(settings['Ensemble']['Method'])
+ settings_dict['Ensemble']['Size'] =\
+ int(settings['Ensemble']['Size'])
+ #settings['Ensemble'].getint('Use')
settings_dict['Ensemble']['Metric'] =\
settings['Ensemble']['Metric']
diff --git a/WORC/doc/_build/html/_modules/WORC/IOparser/config_preprocessing.html b/WORC/doc/_build/html/_modules/WORC/IOparser/config_preprocessing.html
index 28521e7b..7699458f 100644
--- a/WORC/doc/_build/html/_modules/WORC/IOparser/config_preprocessing.html
+++ b/WORC/doc/_build/html/_modules/WORC/IOparser/config_preprocessing.html
@@ -8,7 +8,7 @@
- WORC.IOparser.config_preprocessing — WORC 3.5.0 documentation
+ WORC.IOparser.config_preprocessing — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/IOparser/config_segmentix.html b/WORC/doc/_build/html/_modules/WORC/IOparser/config_segmentix.html
index ea6a5109..a99098e7 100644
--- a/WORC/doc/_build/html/_modules/WORC/IOparser/config_segmentix.html
+++ b/WORC/doc/_build/html/_modules/WORC/IOparser/config_segmentix.html
@@ -8,7 +8,7 @@
- WORC.IOparser.config_segmentix — WORC 3.5.0 documentation
+ WORC.IOparser.config_segmentix — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/IOparser/file_io.html b/WORC/doc/_build/html/_modules/WORC/IOparser/file_io.html
index 400d5530..f3fab5bc 100644
--- a/WORC/doc/_build/html/_modules/WORC/IOparser/file_io.html
+++ b/WORC/doc/_build/html/_modules/WORC/IOparser/file_io.html
@@ -8,7 +8,7 @@
- WORC.IOparser.file_io — WORC 3.5.0 documentation
+ WORC.IOparser.file_io — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/WORC.html b/WORC/doc/_build/html/_modules/WORC/WORC.html
index 3b413dec..526064a5 100644
--- a/WORC/doc/_build/html/_modules/WORC/WORC.html
+++ b/WORC/doc/_build/html/_modules/WORC/WORC.html
@@ -8,7 +8,7 @@
- WORC.WORC — WORC 3.5.0 documentation
+ WORC.WORC — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.WORC
#!/usr/bin/env python
-# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -198,6 +198,7 @@ Source code for WORC.WORC
from WORC.export.hyper_params_exporter import export_hyper_params_to_latex
from urllib.parse import urlparse
from urllib.request import url2pathname
+from WORC.tools.fingerprinting import quantitative_modalities, qualitative_modalities, all_modalities
[docs]class WORC(object):
@@ -335,6 +336,7 @@ Source code for WORC.WORC
self.fastr_memory_parameters['Segmentix'] = '6G'
self.fastr_memory_parameters['ComBat'] = '12G'
self.fastr_memory_parameters['PlotEstimator'] = '12G'
+ self.fastr_memory_parameters['Fingerprinter'] = '12G'
if DebugDetector().do_detection():
print(fastr.config)
@@ -359,9 +361,14 @@ Source code for WORC.WORC
config['General']['TransformationNode'] = "elastix4.8/Transformix:4.8"
config['General']['Joblib_ncores'] = '1'
config['General']['Joblib_backend'] = 'threading'
- config['General']['tempsave'] = 'False'
+ config['General']['tempsave'] = 'True'
config['General']['AssumeSameImageAndMaskMetadata'] = 'False'
config['General']['ComBat'] = 'False'
+ config['General']['Fingerprint'] = 'True'
+
+ # Fingerprinting
+ config['Fingerprinting'] = dict()
+ config['Fingerprinting']['max_num_image'] = '100'
# Options for the object/patient labels that are used
config['Labels'] = dict()
@@ -390,7 +397,7 @@ Source code for WORC.WORC
# Segmentix
config['Segmentix'] = dict()
- config['Segmentix']['mask'] = 'subtract'
+ config['Segmentix']['mask'] = 'None'
config['Segmentix']['segtype'] = 'None'
config['Segmentix']['segradius'] = '5'
config['Segmentix']['N_blobs'] = '1'
@@ -418,7 +425,10 @@ Source code for WORC.WORC
# Parameter settings for PREDICT feature calculation
# Defines only naming of modalities
- config['ImageFeatures']['image_type'] = 'CT'
+ config['ImageFeatures']['image_type'] = ''
+
+ # How to extract the features in different dimension
+ config['ImageFeatures']['extraction_mode'] = '2.5D'
# Define frequencies for gabor filter in pixels
config['ImageFeatures']['gabor_frequencies'] = '0.05, 0.2, 0.5'
@@ -626,6 +636,14 @@ Source code for WORC.WORC
config['Classification']['XGB_min_child_weight'] = '1, 6'
config['Classification']['XGB_colsample_bytree'] = '0.3, 0.7'
+ # https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html. Mainly prevent overfitting
+ config['Classification']['LightGBM_num_leaves'] = '5, 95' # Default 31 so search around that
+    config['Classification']['LightGBM_max_depth'] = config['Classification']['XGB_max_depth'] # Good to limit explicitly to decrease computation time and limit overfitting
+ config['Classification']['LightGBM_min_child_samples'] = '5, 45' # = min_data_in_leaf. Default 20
+ config['Classification']['LightGBM_reg_alpha'] = config['Classification']['LRC']
+ config['Classification']['LightGBM_reg_lambda'] = config['Classification']['LRC']
+ config['Classification']['LightGBM_min_child_weight'] = '-7, 4' # Default 1e-3
+
# CrossValidation
config['CrossValidation'] = dict()
config['CrossValidation']['Type'] = 'random_split'
@@ -638,16 +656,26 @@ Source code for WORC.WORC
config['HyperOptimization']['scoring_method'] = 'f1_weighted'
config['HyperOptimization']['test_size'] = '0.2'
config['HyperOptimization']['n_splits'] = '5'
- config['HyperOptimization']['N_iterations'] = '1000'
+ config['HyperOptimization']['N_iterations'] = '1000' # represents either wallclock time limit or nr of evaluations when using SMAC
config['HyperOptimization']['n_jobspercore'] = '200' # only relevant when using fastr in classification
config['HyperOptimization']['maxlen'] = '100'
config['HyperOptimization']['ranking_score'] = 'test_score'
config['HyperOptimization']['memory'] = '3G'
config['HyperOptimization']['refit_workflows'] = 'False'
+ # SMAC options
+ config['SMAC'] = dict()
+ config['SMAC']['use'] = 'False'
+ config['SMAC']['n_smac_cores'] = '1'
+ config['SMAC']['budget_type'] = 'evals' # ['evals', 'time']
+ config['SMAC']['budget'] = '100' # Nr of evals or time in seconds
+ config['SMAC']['init_method'] = 'random' # ['sobol', 'random']
+ config['SMAC']['init_budget'] = '20' # Nr of evals
+
# Ensemble options
config['Ensemble'] = dict()
- config['Ensemble']['Use'] = '100'
+ config['Ensemble']['Method'] = 'top_N' # ['Single', 'top_N', 'FitNumber', 'ForwardSelection', 'Caruana', 'Bagging']
+ config['Ensemble']['Size'] = '100' # Size of ensemble in top_N, or number of bags in Bagging
config['Ensemble']['Metric'] = 'Default'
# Evaluation options
@@ -657,7 +685,7 @@ Source code for WORC.WORC
# Bootstrap options
config['Bootstrap'] = dict()
config['Bootstrap']['Use'] = 'False'
- config['Bootstrap']['N_iterations'] = '1000'
+ config['Bootstrap']['N_iterations'] = '10000'
return config
@@ -699,9 +727,10 @@ Source code for WORC.WORC
self.configs = [self.defaultconfig()] * len(self.images_train)
else:
self.configs = [self.defaultconfig()] * len(self.features_train)
+
self.network = fastr.create_network(self.name)
- # BUG: We currently use the first configuration as general config
+ # NOTE: We currently use the first configuration as general config
image_types = list()
for c in range(len(self.configs)):
if type(self.configs[c]) == str:
@@ -709,6 +738,10 @@ Source code for WORC.WORC
self.configs[c] = config_io.load_config(self.configs[c])
image_types.append(self.configs[c]['ImageFeatures']['image_type'])
+ if self.configs[0]['General']['Fingerprint'] == 'True' and any(imt not in all_modalities for imt in image_types):
+ m = f'One of your image types {image_types} is not one of the valid image types {quantitative_modalities + qualitative_modalities}. This is mandatory to set when performing fingerprinting, see the WORC Documentation (https://worc.readthedocs.io/en/latest/static/configuration.html#imagefeatures).'
+ raise WORCexceptions.WORCValueError(m)
+
# Create config source
self.source_class_config = self.network.create_source('ParameterFile', id='config_classification_source', node_group='conf', step_id='general_sources')
@@ -717,6 +750,7 @@ Source code for WORC.WORC
if self.labels_test:
self.source_patientclass_test = self.network.create_source('PatientInfoFile', id='patientclass_test', node_group='pctest', step_id='test_sources')
+ # Add classification node
memory = self.fastr_memory_parameters['Classification']
self.classify = self.network.create_node('worc/TrainClassifier:1.0',
tool_version='1.0',
@@ -724,14 +758,35 @@ Source code for WORC.WORC
resources=ResourceLimit(memory=memory),
step_id='WorkflowOptimization')
+ # Add fingerprinting
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.node_fingerprinters = dict()
+ self.links_fingerprinting = dict()
+
+ self.add_fingerprinter(id='classification', type='classification', config_source=self.source_class_config.output)
+
+ # Link output of fingerprinter to classification node
+ self.link_class_1 = self.network.create_link(self.node_fingerprinters['classification'].outputs['config'], self.classify.inputs['config'])
+ # self.link_class_1.collapse = 'conf'
+ else:
+ # Directly parse config to classify node
+ self.link_class_1 = self.network.create_link(self.source_class_config.output, self.classify.inputs['config'])
+ self.link_class_1.collapse = 'conf'
+
if self.fixedsplits:
self.fixedsplits_node = self.network.create_source('CSVFile', id='fixedsplits_source', node_group='conf', step_id='general_sources')
self.classify.inputs['fixedsplits'] = self.fixedsplits_node.output
- self.source_Ensemble =\
- self.network.create_constant('String', [self.configs[0]['Ensemble']['Use']],
- id='Ensemble',
+ self.source_ensemble_method =\
+ self.network.create_constant('String', [self.configs[0]['Ensemble']['Method']],
+ id='ensemble_method',
+ step_id='Evaluation')
+
+ self.source_ensemble_size =\
+ self.network.create_constant('String', [self.configs[0]['Ensemble']['Size']],
+ id='ensemble_size',
step_id='Evaluation')
+
self.source_LabelType =\
self.network.create_constant('String', [self.configs[0]['Labels']['label_names']],
id='LabelType',
@@ -750,13 +805,16 @@ Source code for WORC.WORC
self.sink_class_config = self.network.create_sink('ParameterFile', id='config_classification_sink', node_group='conf', step_id='general_sinks')
# Links
- self.sink_class_config.input = self.source_class_config.output
- self.link_class_1 = self.network.create_link(self.source_class_config.output, self.classify.inputs['config'])
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.sink_class_config.input = self.node_fingerprinters['classification'].outputs['config']
+ else:
+ self.sink_class_config.input = self.source_class_config.output
+
self.link_class_2 = self.network.create_link(self.source_patientclass_train.output, self.classify.inputs['patientclass_train'])
- self.link_class_1.collapse = 'conf'
self.link_class_2.collapse = 'pctrain'
- self.plot_estimator.inputs['ensemble'] = self.source_Ensemble.output
+ self.plot_estimator.inputs['ensemble_method'] = self.source_ensemble_method.output
+ self.plot_estimator.inputs['ensemble_size'] = self.source_ensemble_size.output
self.plot_estimator.inputs['label_type'] = self.source_LabelType.output
if self.labels_test:
@@ -767,6 +825,12 @@ Source code for WORC.WORC
self.plot_estimator.inputs['prediction'] = self.classify.outputs['classification']
self.plot_estimator.inputs['pinfo'] = pinfo
+ # Optional SMAC output
+ if self.configs[0]['SMAC']['use'] == 'True':
+ self.sink_smac_results = self.network.create_sink('JsonFile', id='smac_results',
+ step_id='general_sinks')
+ self.sink_smac_results.input = self.classify.outputs['smac_results']
+
if self.TrainTest:
# FIXME: the naming here is ugly
self.link_class_3 = self.network.create_link(self.source_patientclass_test.output, self.classify.inputs['patientclass_test'])
@@ -800,6 +864,7 @@ Source code for WORC.WORC
self.preprocessing_train = dict()
self.sources_images_train = dict()
self.sinks_features_train = dict()
+ self.sinks_configs = dict()
self.converters_im_train = dict()
self.converters_seg_train = dict()
self.links_C1_train = dict()
@@ -887,7 +952,9 @@ Source code for WORC.WORC
self.modlabels.append(label)
# Create required sources and sinks
- self.sources_parameters[label] = self.network.create_source('ParameterFile', id='config_' + label, step_id='general_sources')
+ self.sources_parameters[label] = self.network.create_source('ParameterFile', id=f'config_{label}', step_id='general_sources')
+ self.sinks_configs[label] = self.network.create_sink('ParameterFile', id=f'config_{label}_sink', node_group='conf', step_id='general_sinks')
+
self.sources_images_train[label] = self.network.create_source('ITKImageFile', id='images_train_' + label, node_group='train', step_id='train_sources')
if self.TrainTest:
self.sources_images_test[label] = self.network.create_source('ITKImageFile', id='images_test_' + label, node_group='test', step_id='test_sources')
@@ -927,7 +994,7 @@ Source code for WORC.WORC
self.converters_masks_test[label].inputs['image'] = self.sources_masks_test[label].output
# First convert the images
- if any(modality in mod for modality in ['MR', 'CT', 'MG', 'PET']):
+ if any(modality in mod for modality in all_modalities):
# Use WORC PXCastConvet for converting image formats
memory = self.fastr_memory_parameters['WORCCastConvert']
self.converters_im_train[label] =\
@@ -952,6 +1019,23 @@ Source code for WORC.WORC
if self.TrainTest:
self.converters_im_test[label].inputs['image'] = self.sources_images_test[label].output
+ # -----------------------------------------------------
+ # Add fingerprinting
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.add_fingerprinter(id=label, type='images', config_source=self.sources_parameters[label].output)
+ self.links_fingerprinting[f'{label}_images'] = self.network.create_link(self.converters_im_train[label].outputs['image'], self.node_fingerprinters[label].inputs['images_train'])
+ self.links_fingerprinting[f'{label}_images'].collapse = 'train'
+
+ self.sinks_configs[label].input = self.node_fingerprinters[label].outputs['config']
+
+ if nmod == 0:
+ # Also add images from first modality for classification fingerprinter
+ self.links_fingerprinting['classification'] = self.network.create_link(self.converters_im_train[label].outputs['image'], self.node_fingerprinters['classification'].inputs['images_train'])
+ self.links_fingerprinting['classification'].collapse = 'train'
+
+ else:
+ self.sinks_configs[label].input = self.sources_parameters[label].output
+
# -----------------------------------------------------
# Preprocessing
preprocess_node = str(self.configs[nmod]['General']['Preprocessing'])
@@ -1015,6 +1099,11 @@ Source code for WORC.WORC
self.converters_seg_test[label].inputs['image'] =\
self.sources_segmentations_test[label].output
+ # Add to fingerprinting if required
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.links_fingerprinting[f'{label}_segmentations'] = self.network.create_link(self.converters_seg_train[label].outputs['image'], self.node_fingerprinters[label].inputs['segmentations_train'])
+ self.links_fingerprinting[f'{label}_segmentations'].collapse = 'train'
+
elif self.segmode == 'Register':
# ---------------------------------------------
# Registration nodes: Align segmentation of first
@@ -1146,12 +1235,49 @@ Source code for WORC.WORC
self.links_C1_test[label] = self.classify.inputs['features_test'][str(label)] << self.sources_features_test[label].output
self.links_C1_test[label].collapse = 'test'
+ # Add input to fingerprinting for classification
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ if num == 0:
+ self.links_fingerprinting['classification'] = self.network.create_link(self.sources_features_train[label].output, self.node_fingerprinters['classification'].inputs['features_train'])
+ self.links_fingerprinting['classification'].collapse = 'train'
else:
raise WORCexceptions.WORCIOError("Please provide labels.")
else:
raise WORCexceptions.WORCIOError("Please provide either images or features.")
+[docs] def add_fingerprinter(self, id, type, config_source):
+ """Add WORC Fingerprinter to the network.
+
+ Note: applied per imaging sequence, or per feature file if no
+ images are present.
+ """
+ # Add fingerprinting tool
+ memory = self.fastr_memory_parameters['Fingerprinter']
+ fingerprinter_node = self.network.create_node('worc/Fingerprinter:1.0',
+ tool_version='1.0',
+ id=f'fingerprinter_{id}',
+ resources=ResourceLimit(memory=memory),
+ step_id='FingerPrinting')
+
+ # Add general sources to fingerprinting node
+ fingerprinter_node.inputs['config'] = config_source
+ fingerprinter_node.inputs['patientclass_train'] = self.source_patientclass_train.output
+
+ # Add type input
+ valid_types = ['classification', 'images']
+ if type not in valid_types:
+ raise WORCexceptions.WORCValueError(f'Type {type} is not valid for fingeprinting. Should be one of {valid_types}.')
+
+ type_node = self.network.create_constant('String', type,
+ id=f'type_fingerprint_{id}',
+ node_group='train',
+ step_id='FingerPrinting')
+ fingerprinter_node.inputs['type'] = type_node.output
+
+ # Add to list of fingerprinting nodes
+ self.node_fingerprinters[id] = fingerprinter_node
+
[docs] def add_ComBat(self):
"""Add ComBat harmonization to the network.
@@ -1169,7 +1295,11 @@ Source code for WORC.WORC
self.sinks_features_train_ComBat = self.network.create_sink('HDF5', id='features_train_ComBat', step_id='ComBat')
# Create links for inputs
- self.link_combat_1 = self.network.create_link(self.source_class_config.output, self.ComBat.inputs['config'])
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.link_combat_1 = self.network.create_link(self.node_fingerprinters['classification'].outputs['config'], self.ComBat.inputs['config'])
+ else:
+ self.link_combat_1 = self.network.create_link(self.source_class_config.output, self.ComBat.inputs['config'])
+
self.link_combat_2 = self.network.create_link(self.source_patientclass_train.output, self.ComBat.inputs['patientclass_train'])
self.link_combat_1.collapse = 'conf'
self.link_combat_2.collapse = 'pctrain'
@@ -1202,11 +1332,19 @@ Source code for WORC.WORC
self.preprocessing_test[label] = self.network.create_node(preprocess_node, tool_version='1.0', id='preprocessing_test_' + label, resources=ResourceLimit(memory=memory), step_id='Preprocessing')
# Create required links
- self.preprocessing_train[label].inputs['parameters'] = self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.preprocessing_train[label].inputs['parameters'] = self.node_fingerprinters[label].outputs['config']
+ else:
+ self.preprocessing_train[label].inputs['parameters'] = self.sources_parameters[label].output
+
self.preprocessing_train[label].inputs['image'] = self.converters_im_train[label].outputs['image']
if self.TrainTest:
- self.preprocessing_test[label].inputs['parameters'] = self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.preprocessing_test[label].inputs['parameters'] = self.node_fingerprinters[label].outputs['config']
+ else:
+ self.preprocessing_test[label].inputs['parameters'] = self.sources_parameters[label].output
+
self.preprocessing_test[label].inputs['image'] = self.converters_im_test[label].outputs['image']
if self.metadata_train and len(self.metadata_train) >= nmod + 1:
@@ -1249,12 +1387,13 @@ Source code for WORC.WORC
# Check if we need to add pyradiomics specific sources
if 'pyradiomics' in calcfeat_node.lower():
- # Add a config source
- self.source_config_pyradiomics[label] =\
- self.network.create_source('YamlFile',
- id='config_pyradiomics_' + label,
- node_group='train',
- step_id='Feature_Extraction')
+ if self.configs[0]['General']['Fingerprint'] != 'True':
+ # Add a config source
+ self.source_config_pyradiomics[label] =\
+ self.network.create_source('YamlFile',
+ id='config_pyradiomics_' + label,
+ node_group='train',
+ step_id='Feature_Extraction')
# Add a format source, which we are going to set to a constant
# And attach to the tool node
@@ -1273,22 +1412,38 @@ Source code for WORC.WORC
# Create required links
# We can have a different config for different tools
if 'pyradiomics' in calcfeat_node.lower():
- node_train.inputs['parameters'] =\
- self.source_config_pyradiomics[label].output
+ if self.configs[0]['General']['Fingerprint'] != 'True':
+ node_train.inputs['parameters'] =\
+ self.source_config_pyradiomics[label].output
+ else:
+ node_train.inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config_pyradiomics']
else:
- node_train.inputs['parameters'] =\
- self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ node_train.inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ node_train.inputs['parameters'] =\
+ self.sources_parameters[label].output
node_train.inputs['image'] =\
self.preprocessing_train[label].outputs['image']
if self.TrainTest:
if 'pyradiomics' in calcfeat_node.lower():
- node_test.inputs['parameters'] =\
- self.source_config_pyradiomics[label].output
+ if self.configs[0]['General']['Fingerprint'] != 'True':
+ node_test.inputs['parameters'] =\
+ self.source_config_pyradiomics[label].output
+ else:
+ node_test.inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config_pyradiomics']
else:
- node_test.inputs['parameters'] =\
- self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ node_test.inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ node_test.inputs['parameters'] =\
+ self.sources_parameters[label].output
node_test.inputs['image'] =\
self.preprocessing_test[label].outputs['image']
@@ -1346,7 +1501,11 @@ Source code for WORC.WORC
step_id='Feature_Extraction')
conv_train.inputs['toolbox'] = self.source_toolbox_name[label].output
- conv_train.inputs['config'] = self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ conv_train.inputs['config'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ conv_train.inputs['config'] = self.sources_parameters[label].output
if self.TrainTest:
conv_test =\
@@ -1358,7 +1517,12 @@ Source code for WORC.WORC
conv_test.inputs['feat_in'] = node_test.outputs['features']
conv_test.inputs['toolbox'] = self.source_toolbox_name[label].output
- conv_test.inputs['config'] = self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ conv_test.inputs['config'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ conv_test.inputs['config'] =\
+ self.sources_parameters[label].output
# Append to nodes to list
self.calcfeatures_train[label].append(node_train)
@@ -1642,6 +1806,11 @@ Source code for WORC.WORC
self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
self.transformix_seg_nodes_test[label].outputs['image']
+ # Add to fingerprinting if required
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.links_fingerprinting[f'{label}_segmentations'] = self.network.create_link(self.transformix_seg_nodes_train[label].outputs['image'], self.node_fingerprinters[label].inputs['segmentations_train'])
+ self.links_fingerprinting[f'{label}_segmentations'].collapse = 'train'
+
# Save outputfor the training set
self.sinks_transformations_train[label] =\
self.network.create_sink('ElastixTransformFile',
@@ -1732,8 +1901,13 @@ Source code for WORC.WORC
self.converters_seg_train[label].outputs['image']
# Input the parameters
- self.nodes_segmentix_train[label].inputs['parameters'] =\
- self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.nodes_segmentix_train[label].inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ self.nodes_segmentix_train[label].inputs['parameters'] =\
+ self.sources_parameters[label].output
+
self.sinks_segmentations_segmentix_train[label].input =\
self.nodes_segmentix_train[label].outputs['segmentation_out']
@@ -1772,8 +1946,13 @@ Source code for WORC.WORC
self.nodes_segmentix_test[label].inputs['segmentation_in'] =\
self.converters_seg_test[label].outputs['image']
- self.nodes_segmentix_test[label].inputs['parameters'] =\
- self.sources_parameters[label].output
+ if self.configs[0]['General']['Fingerprint'] == 'True':
+ self.nodes_segmentix_test[label].inputs['parameters'] =\
+ self.node_fingerprinters[label].outputs['config']
+ else:
+ self.nodes_segmentix_test[label].inputs['parameters'] =\
+ self.sources_parameters[label].output
+
self.sinks_segmentations_segmentix_test[label].input =\
self.nodes_segmentix_test[label].outputs['segmentation_out']
@@ -1785,7 +1964,6 @@ Source code for WORC.WORC
self.calcfeatures_test[label][i_node].inputs['segmentation'] =\
self.nodes_segmentix_test[label].outputs['segmentation_out']
-
if self.masks_train and len(self.masks_train) >= nmod + 1:
# Use masks
self.nodes_segmentix_train[label].inputs['mask'] =\
@@ -1807,10 +1985,7 @@ Source code for WORC.WORC
# fixed splits
if self.fixedsplits:
- self.source_data['fixedsplits_source'] = self.fixedsplits
-
- # Generate gridsearch parameter files if required
- self.source_data['config_classification_source'] = self.fastrconfigs[0]
+ self.source_data['fixedsplits_source'] = self.fixedsplits
# Set source and sink data
self.source_data['patientclass_train'] = self.labels_train
@@ -1818,16 +1993,17 @@ Source code for WORC.WORC
self.sink_data['classification'] = ("vfs://output/{}/estimator_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
self.sink_data['performance'] = ("vfs://output/{}/performance_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
+ self.sink_data['smac_results'] = ("vfs://output/{}/smac_results_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
self.sink_data['config_classification_sink'] = ("vfs://output/{}/config_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
self.sink_data['features_train_ComBat'] = ("vfs://output/{}/ComBat/features_ComBat_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
self.sink_data['features_test_ComBat'] = ("vfs://output/{}/ComBat/features_ComBat_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name)
-
-
# Set the source data from the WORC objects you created
for num, label in enumerate(self.modlabels):
self.source_data['config_' + label] = self.fastrconfigs[num]
- if self.pyradiomics_configs:
+ self.sink_data[f'config_{label}_sink'] = f"vfs://output/{self.name}/config_{label}_{{sample_id}}_{{cardinality}}{{ext}}"
+
+ if 'pyradiomics' in self.configs[0]['General']['FeatureCalculators'] and self.configs[0]['General']['Fingerprint'] != 'True':
self.source_data['config_pyradiomics_' + label] = self.pyradiomics_configs[num]
# Add train data sources
@@ -1908,7 +2084,14 @@ Source code for WORC.WORC
self.sink_data['transformations_test_' + label] = ("vfs://output/{}/Elastix/transformation_{}_{{sample_id}}_{{cardinality}}{{ext}}").format(self.name, label)
if self._add_evaluation:
- self.Evaluate.set()
+ self.Evaluate.set()
+
+ # Generate gridsearch parameter files if required
+ self.source_data['config_classification_source'] = self.fastrconfigs[0]
+
+ # Give configuration sources to WORC
+ for num, label in enumerate(self.modlabels):
+ self.source_data['config_' + label] = self.fastrconfigs[num]
[docs] def execute(self):
"""Execute the network through the fastr.network.execute command."""
@@ -1970,14 +2153,16 @@ Source code for WORC.WORC
config = configparser.ConfigParser()
config.read(c)
c = config
+
cfile = os.path.join(self.fastr_tmpdir, f"config_{self.name}_{num}.ini")
if not os.path.exists(os.path.dirname(cfile)):
os.makedirs(os.path.dirname(cfile))
+
with open(cfile, 'w') as configfile:
c.write(configfile)
- # If PyRadiomics is used, also write a config for PyRadiomics
- if 'pyradiomics' in c['General']['FeatureCalculators']:
+        # If PyRadiomics is used and there is no fingerprinting, also write a config for PyRadiomics
+ if 'pyradiomics' in c['General']['FeatureCalculators'] and self.configs[0]['General']['Fingerprint'] != 'True':
cfile_pyradiomics = os.path.join(self.fastr_tmpdir, f"config_pyradiomics_{self.name}_{num}.yaml")
config_pyradiomics = io.convert_config_pyradiomics(c)
with open(cfile_pyradiomics, 'w') as file:
diff --git a/WORC/doc/_build/html/_modules/WORC/addexceptions.html b/WORC/doc/_build/html/_modules/WORC/addexceptions.html
index b751c379..dbd4270e 100644
--- a/WORC/doc/_build/html/_modules/WORC/addexceptions.html
+++ b/WORC/doc/_build/html/_modules/WORC/addexceptions.html
@@ -8,7 +8,7 @@
- WORC.addexceptions — WORC 3.5.0 documentation
+ WORC.addexceptions — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/AdvancedSampler.html b/WORC/doc/_build/html/_modules/WORC/classification/AdvancedSampler.html
index 7422e845..0e554819 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/AdvancedSampler.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/AdvancedSampler.html
@@ -8,7 +8,7 @@
- WORC.classification.AdvancedSampler — WORC 3.5.0 documentation
+ WORC.classification.AdvancedSampler — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html b/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
index 0ba21179..6c8bf1fa 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/ObjectSampler.html
@@ -8,7 +8,7 @@
- WORC.classification.ObjectSampler — WORC 3.5.0 documentation
+ WORC.classification.ObjectSampler — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/RankedSVM.html b/WORC/doc/_build/html/_modules/WORC/classification/RankedSVM.html
index d6cacddb..8485a0fd 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/RankedSVM.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/RankedSVM.html
@@ -8,7 +8,7 @@
- WORC.classification.RankedSVM — WORC 3.5.0 documentation
+ WORC.classification.RankedSVM — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/SearchCV.html b/WORC/doc/_build/html/_modules/WORC/classification/SearchCV.html
index 6702946e..237308ee 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/SearchCV.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/SearchCV.html
@@ -8,7 +8,7 @@
- WORC.classification.SearchCV — WORC 3.5.0 documentation
+ WORC.classification.SearchCV — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -221,6 +221,11 @@ Source code for WORC.classification.SearchCV
from WORC.detectors.detectors import DebugDetector
import WORC.addexceptions as WORCexceptions
+# Imports used in the Bayesian optimization
+from WORC.classification.smac import build_smac_config
+from datetime import datetime
+import copy
+
[docs]def rms_score(truth, prediction):
"""Root-mean-square-error metric."""
@@ -522,10 +527,10 @@ Source code for WORC.classification.SearchCV
def __init__(self, param_distributions={}, n_iter=10, scoring=None,
fit_params=None, n_jobs=1, iid=True,
refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs',
- random_state=None, error_score='raise',
- return_train_score=True,
+ random_state=None, error_score='raise', return_train_score=True,
n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G',
- ranking_score='test_score', refit_workflows=False):
+ ranking_score='test_score', refit_workflows=False,
+ ensemble_validation_score=None):
"""Initialize SearchCV Object."""
# Added for fastr and joblib executions
self.param_distributions = param_distributions
@@ -535,6 +540,7 @@ Source code for WORC.classification.SearchCV
self.ensemble = list()
self.fastr_plugin = fastr_plugin
self.memory = memory
+ self.ensemble_validation_score = ensemble_validation_score
# Below are the defaults from sklearn
self.scoring = scoring
@@ -798,7 +804,7 @@ Source code for WORC.classification.SearchCV
[docs] def process_fit(self, n_splits, parameters_all,
test_sample_counts, test_score_dicts,
train_score_dicts, fit_time, score_time, cv_iter,
- X, y, fitted_workflows=None):
+ X, y, fitted_workflows=None, use_smac=False):
"""Process a fit.
Process the outcomes of a SearchCV fit and find the best settings
@@ -817,7 +823,11 @@ Source code for WORC.classification.SearchCV
# We take only one result per split, default by sklearn
pipelines_per_split = int(len(parameters_all) / n_splits)
- candidate_params_all = list(parameters_all[:pipelines_per_split])
+ # Change the list of parameters based on the shape of the input
+ if use_smac:
+ candidate_params_all = list(parameters_all[::n_splits])
+ else:
+ candidate_params_all = list(parameters_all[:pipelines_per_split])
n_candidates = len(candidate_params_all)
# Store some of the resulting scores
@@ -825,9 +835,13 @@ Source code for WORC.classification.SearchCV
# Computed the (weighted) mean and std for test scores alone
def _store(key_name, array, weights=None, splits=False, rank=False):
- """A small helper to store the scores/times to the cv_results_."""
- array = np.transpose(np.array(array, dtype=np.float64).reshape(n_splits,
- n_candidates))
+ """A small helper to store the scores/times to the cv_results_"""
+ # Change processing based on the shape of the input
+ if use_smac:
+ array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits)
+ else:
+ array = np.transpose(np.array(array, dtype=np.float64).reshape(n_splits,
+ n_candidates))
if splits:
for split_i in range(n_splits):
@@ -972,6 +986,9 @@ Source code for WORC.classification.SearchCV
n_candidates = len(candidate_params_all)
results['params'] = candidate_params_all
+ # Calculate and store the total_fit_time of this train/test CV
+ results['total_fit_time'] = np.sum(fit_time)
+
# Store the atributes of the best performing estimator
best_index = np.flatnonzero(results["rank_" + self.ranking_score] == 1)[0]
best_parameters_all = candidate_params_all[best_index]
@@ -1087,34 +1104,41 @@ Source code for WORC.classification.SearchCV
return self
-[docs] def create_ensemble(self, X_train, Y_train, verbose=None, initialize=True,
- scoring=None, method=50, overfit_scaler=False):
- """Create ensemble of multiple workflows.
+[docs] def create_ensemble(self, X_train, Y_train, verbose=None, initialize=False,
+ scoring=None, method='top_N', size=50, overfit_scaler=False):
+ """
Create an (optimal) ensemble of a combination of hyperparameter settings
and the associated groupsels, PCAs, estimators etc.
- Based on Caruana et al. 2004, but a little different:
-
- 1. Recreate the training/validation splits for a n-fold cross validation.
- 2. For each fold:
- a. Start with an empty ensemble
- b. Create starting ensemble by adding N individually best performing
- models on the validation set. N is tuned on the validation set.
- c. Add model that improves ensemble performance on validation set the most, with replacement.
- d. Repeat (c) untill performance does not increase
-
- The performance metric is the same as for the original hyperparameter
- search, i.e. probably the F1-score for classification and r2-score
- for regression. However, we recommend using the SAR score, as this is
- more universal.
-
- Method: top50 or Caruana
+ # The following ensemble methods are supported:
+ # Single:
+ # only use the single best classifier. Performance is computed
+ # using the same predict function as during the optimization
+ # top_N:
+ # make an ensemble of the best N individual classifiers, where N is
+ # given as an input. If N==1, then only the single best classifier is
+ # used, but it is evaluated using predict_proba.
+ # FitNumber:
+ # make an ensemble of the best N individual classifiers, choosing N
+ # that gives the highest performance
+ # ForwardSelection:
+ # add the model that optimizes the total ensemble performance,
+ # then repeat with replacement until there is no more improvement
+ # in performance
+ # Caruana:
+ # for a fixed number of iterations, add the model that optimizes
+ # the total ensemble performance, then choose the ensemble size
+ # which gave the best performance
+ # Bagging:
+ # same as Caruana method, but the final ensemble is a weighted average
+ # of a number of ensembles that each use only a subset of the available
+ # models
"""
# Define a function for scoring the performance of a classifier
def compute_performance(scoring, Y_valid_truth, Y_valid_score):
- if scoring == 'f1_weighted':
+ if scoring == 'f1_weighted' or scoring == 'f1':
# Convert score to binaries first
for num in range(0, len(Y_valid_score)):
if Y_valid_score[num] >= 0.5:
@@ -1157,109 +1181,134 @@ Source code for WORC.classification.SearchCV
base_estimator = RandomizedSearchCVfastr()
elif type(self) == RandomizedSearchCVJoblib:
base_estimator = RandomizedSearchCVJoblib()
-
- if type(method) is int:
- # Simply take the top50 best hyperparameters
- if verbose:
- print(f'Creating ensemble using top {str(method)} individual classifiers.')
- if method == 1:
- # Next functions expect list
- ensemble = [0]
- else:
- ensemble = range(0, method)
-
- elif method == 'FitNumber':
- # Use optimum number of models
-
- # In order to speed up the process, we precompute all scores of the possible
- # classifiers in all cross validation estimatons
-
- # Create the training and validation set scores
+ elif type(self) == GuidedSearchCVSMAC:
+ base_estimator = GuidedSearchCVSMAC()
+
+ if method == 'Single':
+ # Do not refit all the classifiers if we only need the best one
+ ensemble = [0]
+ elif method == 'top_N':
+ # Do not refit all the classifiers if we only need the best N
+ ensemble = range(0, size)
+ else:
+ # Refit the models and compute the predictions on the validation sets
if verbose:
- print('Precomputing scores on training and validation set.')
- Y_valid_score = list()
+ print('Precomputing scores on training and validation set for ensembling.')
Y_valid_truth = list()
- performances = np.zeros((n_iter, n_classifiers))
- for it, (train, valid) in enumerate(self.cv_iter):
- if verbose:
- print(f' - iteration {it + 1} / {n_iter}.')
- Y_valid_score_it = np.zeros((n_classifiers, len(valid)))
-
- # Loop over the 100 best estimators
- for num, p_all in enumerate(parameters_all):
- # NOTE: Explicitly exclude validation set, elso refit and score
- # somehow still seems to use it.
- X_train_temp = [X_train[i] for i in train]
- Y_train_temp = [Y_train[i] for i in train]
- train_temp = np.arange(0, len(train))
-
- # Refit a SearchCV object with the provided parameters
- base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
- train_temp, train_temp,
- verbose=False)
-
- # Predict and save scores
- X_train_values = [x[0] for x in X_train] # Throw away labels
- X_train_values_valid = [X_train_values[i] for i in valid]
- Y_valid_score_temp = base_estimator.predict_proba(X_train_values_valid)
-
- # Only take the probabilities for the second class
- Y_valid_score_temp = Y_valid_score_temp[:, 1]
-
- # Append to array for all classifiers on this validation set
- Y_valid_score_it[num, :] = Y_valid_score_temp
-
+ performances = list()
+ all_predictions = list()
+ ensemble_configurations = list()
+ prediction_length = len(self.cv_iter[0][1])
+ for num, p_all in enumerate(parameters_all):
+ performances_iter = list()
+ predictions_iter = np.zeros((n_iter, prediction_length))
+
+ for it, (train, valid) in enumerate(self.cv_iter):
+ predictions = list()
+ # Start with storing the ground truth
if num == 0:
- # Also store the validation ground truths
Y_valid_truth.append(Y_train[valid])
- performances[it, num] = compute_performance(scoring,
- Y_train[valid],
- Y_valid_score_temp)
-
- Y_valid_score.append(Y_valid_score_it)
-
- # Sorted Ensemble Initialization -------------------------------------
- # Go on adding to the ensemble untill we find the optimal performance
- # Initialize variables
-
- # Note: doing this in a greedy way doesnt work. We compute the
- # performances for the ensembles of lengt [1, n_classifiers] and
- # select the optimum
+ new_estimator = clone(base_estimator)
+
+ # Fit the preprocessors of the pipeline
+ out = fit_and_score(X_train, Y_train, scoring,
+ train, valid, p_all,
+ return_all=True)
+ (save_data, GroupSel, VarSel, SelectModel, feature_labels, scalers,
+ encoders, Imputers, PCAs, StatisticalSel, ReliefSel, Sampler) = out
+ new_estimator.best_groupsel = GroupSel
+ new_estimator.best_scaler = scalers
+ new_estimator.best_varsel = VarSel
+ new_estimator.best_modelsel = SelectModel
+ new_estimator.best_preprocessor = None
+ new_estimator.best_imputer = Imputers
+ new_estimator.best_encoder = encoders
+ new_estimator.best_pca = PCAs
+ new_estimator.best_featlab = feature_labels
+ new_estimator.best_statisticalsel = StatisticalSel
+ new_estimator.best_reliefsel = ReliefSel
+ new_estimator.best_Sampler = Sampler
+
+ # Use the fitted preprocessors to preprocess the features
+ X_train_values = np.asarray([x[0] for x in X_train])
+ processed_X, processed_y = new_estimator.preprocess(X_train_values[train],
+ Y_train[train],
+ training=True)
+ # Check if there are features left
+ (patients, features_left) = np.shape(processed_X)
+ if features_left == 0:
+ print('no features left' + '\n')
+ # No features are left; do not consider this pipeline for the ensemble
+ break
+ else:
+ # Construct and fit the classifier
+ best_estimator = cc.construct_classifier(p_all)
+ best_estimator.fit(processed_X, processed_y)
+ new_estimator.best_estimator_ = best_estimator
+ predictions = new_estimator.predict_proba(X_train_values[valid])
+
+ # Only take the probabilities for the second class
+ predictions = predictions[:, 1]
+
+ # Store the predictions on this split
+ #predictions_iter.append(predictions)
+ predictions_iter[it, :] = predictions
+
+ # Compute and store the performance on this split
+ performances_iter.append(compute_performance(scoring,
+ Y_train[valid],
+ predictions))
+
+ # print('fitandscore: ' + str(out[0][1]) + ' and computed: ' +
+ # str(compute_performance(scoring, Y_train[valid], predictions)) + '\n')
+
+ # At the end of the last iteration, store the results of this pipeline
+ if it == (n_iter - 1):
+ # Add the pipeline to the list
+ ensemble_configurations.append(p_all)
+ # Store the predictions
+ all_predictions.append(predictions_iter)
+ # Store the performance
+ performances.append(np.mean(performances_iter))
+
+ # Update the parameters
+ parameters_all = ensemble_configurations
+ n_classifiers = len(ensemble_configurations)
+ # Construct the array of final predictions
+ base_Y_valid_score = np.zeros((n_iter, n_classifiers, prediction_length))
+ for iter in range(n_iter):
+ for num in range(n_classifiers):
+ base_Y_valid_score[iter][num] = all_predictions[num][iter]
+
+ # Create the ensemble using the precomputed scores:
+
+ # Initialize the ensemble
+ ensemble = list()
+ # Initialize the stacked list of predictions that we keep for the ensemble
+ y_score = [None] * n_iter
best_performance = 0
new_performance = 0.001
+ single_estimator_performance = max(performances)
iteration = 0
- ensemble = list()
- y_score = [None]*n_iter
- best_index = 0
- single_estimator_performance = new_performance
- if initialize:
- # Rank the models based on scoring on the validation set
- performances = np.mean(performances, axis=0)
+ if method == 'FitNumber':
sortedindices = np.argsort(performances)[::-1]
performances_n_class = list()
if verbose:
print("\n")
- print('Sorted Ensemble Initialization.')
- # while new_performance > best_performance:
- for dummy in range(0, n_classifiers):
- # Score is better, so expand ensemble and replace new best score
- best_performance = new_performance
+ print('Creating ensemble with FitNumber method.')
+
+ for iteration in range(0, n_classifiers):
+ Y_valid_score = copy.deepcopy(base_Y_valid_score)
if iteration > 1:
- # Stack scores: not needed for first iteration
- ensemble.append(best_index)
- # N_models += 1
for num in range(0, n_iter):
y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))
elif iteration == 1:
# Create y_score object for second iteration
- single_estimator_performance = new_performance
- ensemble.append(best_index)
- # N_models += 1
for num in range(0, n_iter):
y_score[num] = Y_valid_score[num][ensemble[-1], :]
@@ -1281,7 +1330,7 @@ Source code for WORC.classification.SearchCV
new_performance = np.mean(performances_temp)
performances_n_class.append(new_performance)
best_index = sortedindices[iteration]
- iteration += 1
+ ensemble.append(best_index)
# Select N_models for initialization
new_performance = max(performances_n_class)
@@ -1289,203 +1338,224 @@ Source code for WORC.classification.SearchCV
ensemble = ensemble[0:N_models]
best_performance = new_performance
- # Print the performance gain
print(f"Ensembling best {scoring}: {best_performance}.")
print(f"Single estimator best {scoring}: {single_estimator_performance}.")
print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
- elif method == 'Caruana':
- # Use the method from Caruana
- if verbose:
- print('Creating ensemble with Caruana method.')
-
- # In order to speed up the process, we precompute all scores of the possible
- # classifiers in all cross validation estimatons
+ elif method == 'ForwardSelection':
- # Create the training and validation set scores
- if verbose:
- print('Precomputing scores on training and validation set.')
- Y_valid_score = list()
- Y_valid_truth = list()
- performances = np.zeros((n_iter, n_classifiers))
- for it, (train, valid) in enumerate(self.cv_iter):
if verbose:
- print(f' - iteration {it + 1} / {n_iter}.')
- Y_valid_score_it = np.zeros((n_classifiers, len(valid)))
-
- # Loop over the 100 best estimators
- for num, p_all in enumerate(parameters_all):
- # NOTE: Explicitly exclude validation set, elso refit and score
- # somehow still seems to use it.
- X_train_temp = [X_train[i] for i in train]
- Y_train_temp = [Y_train[i] for i in train]
- train_temp = np.arange(0, len(train))
-
- # Refit a SearchCV object with the provided parameters
- base_estimator.refit_and_score(X_train_temp, Y_train_temp, p_all,
- train_temp, train_temp,
- verbose=False)
-
- # Predict and save scores
- X_train_values = [x[0] for x in X_train] # Throw away labels
- X_train_values_valid = [X_train_values[i] for i in valid]
- Y_valid_score_temp = base_estimator.predict_proba(X_train_values_valid)
+ print('Creating ensemble with ForwardSelection method.')
- # Only take the probabilities for the second class
- Y_valid_score_temp = Y_valid_score_temp[:, 1]
+ while new_performance > best_performance:
+ Y_valid_score = copy.deepcopy(base_Y_valid_score)
+ if verbose:
+ print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
+ best_performance = new_performance
- # Append to array for all classifiers on this validation set
- Y_valid_score_it[num, :] = Y_valid_score_temp
+ if iteration > 1:
+ ensemble.append(best_index)
+ for num in range(0, n_iter):
+ y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))
- if num == 0:
- # Also store the validation ground truths
- Y_valid_truth.append(Y_train[valid])
+ elif iteration == 1:
+ # Create y_score object for second iteration
+ ensemble.append(best_index)
+ for num in range(0, n_iter):
+ y_score[num] = Y_valid_score[num][ensemble[-1], :]
- performances[it, num] = compute_performance(scoring,
- Y_train[valid],
- Y_valid_score_temp)
+ # Perform n-fold cross validation to estimate performance of each possible addition to ensemble
+ performances_temp = np.zeros((n_iter, n_classifiers))
+ for n_crossval in range(0, n_iter):
+ # For each estimator, add the score to the ensemble and new ensemble performance
+ for n_estimator in range(0, n_classifiers):
+ if iteration == 0:
+ # No y_score yet, so we need to build it instead of stacking
+ y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :]
+ else:
+ # Stack scores of added model on top of previous scores and average
+ y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0)
- Y_valid_score.append(Y_valid_score_it)
+ perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
+ performances_temp[n_crossval, n_estimator] = perf
- # Sorted Ensemble Initialization -------------------------------------
- # Go on adding to the ensemble untill we find the optimal performance
- # Initialize variables
+ # Average performances over crossval
+ performances_temp = list(np.mean(performances_temp, axis=0))
- # Note: doing this in a greedy way doesnt work. We compute the
- # performances for the ensembles of lengt [1, n_classifiers] and
- # select the optimum
- best_performance = 0
- new_performance = 0.001
- iteration = 0
- ensemble = list()
- y_score = [None]*n_iter
- best_index = 0
- single_estimator_performance = new_performance
+ # Check which ensemble should be in the ensemble to maximally improve
+ new_performance = max(performances_temp)
+ best_index = performances_temp.index(new_performance)
+ iteration += 1
- if initialize:
- # Rank the models based on scoring on the validation set
- performances = np.mean(performances, axis=0)
- sortedindices = np.argsort(performances)[::-1]
- performances_n_class = list()
+ # Print the performance gain
+ print(f"Ensembling best {scoring}: {best_performance}.")
+ print(f"Single estimator best {scoring}: {single_estimator_performance}.")
+ print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
+ elif method == 'Caruana':
if verbose:
- print("\n")
- print('Sorted Ensemble Initialization.')
- # while new_performance > best_performance:
- for dummy in range(0, n_classifiers):
- # Score is better, so expand ensemble and replace new best score
- best_performance = new_performance
+ print('Creating ensemble with Caruana method.')
+
+ best_ensemble_scores = list()
+
+ while iteration < 20:
+ Y_valid_score = copy.deepcopy(base_Y_valid_score)
+ if verbose:
+ print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
if iteration > 1:
# Stack scores: not needed for first iteration
- ensemble.append(best_index)
- # N_models += 1
for num in range(0, n_iter):
y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))
elif iteration == 1:
# Create y_score object for second iteration
- single_estimator_performance = new_performance
- ensemble.append(best_index)
- # N_models += 1
for num in range(0, n_iter):
y_score[num] = Y_valid_score[num][ensemble[-1], :]
- # Perform n-fold cross validation to estimate performance of next best classifier
- performances_temp = np.zeros((n_iter))
+ # Perform n-fold cross validation to estimate performance of each possible addition to ensemble
+ performances_temp = np.zeros((n_iter, n_classifiers))
for n_crossval in range(0, n_iter):
# For each estimator, add the score to the ensemble and new ensemble performance
- if iteration == 0:
- # No y_score yet, so we need to build it instead of stacking
- y_valid_score_new = Y_valid_score[n_crossval][sortedindices[iteration], :]
- else:
- # Stack scores of added model on top of previous scores and average
- y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][sortedindices[iteration], :])), axis=0)
+ for n_estimator in range(0, n_classifiers):
+ if iteration == 0:
+ # No y_score yet, so we need to build it instead of stacking
+ y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :]
+ else:
+ # Stack scores of added model on top of previous scores and average
+ y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0)
- perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
- performances_temp[n_crossval] = perf
+ perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
+ performances_temp[n_crossval, n_estimator] = perf
+
+ # Average performances over crossval
+ performances_temp = list(np.mean(performances_temp, axis=0))
# Check which ensemble should be in the ensemble to maximally improve
- new_performance = np.mean(performances_temp)
- performances_n_class.append(new_performance)
- best_index = sortedindices[iteration]
+ new_performance = max(performances_temp)
+ best_ensemble_scores.append(new_performance)
+ best_index = performances_temp.index(new_performance)
+ ensemble.append(best_index)
iteration += 1
- # Select N_models for initialization
- new_performance = max(performances_n_class)
- N_models = performances_n_class.index(new_performance) + 1 # +1 due to python indexing
- ensemble = ensemble[0:N_models]
- best_performance = new_performance
+ # Select the optimal ensemble size
+ optimal_ensemble_performance = max(best_ensemble_scores)
+ optimal_N_models = best_ensemble_scores.index(optimal_ensemble_performance) + 1
+ ensemble = ensemble[0:optimal_N_models]
+ best_performance = optimal_ensemble_performance
# Print the performance gain
print(f"Ensembling best {scoring}: {best_performance}.")
print(f"Single estimator best {scoring}: {single_estimator_performance}.")
print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
- # Greedy selection -----------------------------------------------
- # Initialize variables
- best_performance -= 1e-10
- iteration = 0
-
- # Go on adding to the ensemble untill we find the optimal performance
- if verbose:
- print("\n")
- print('Greedy selection.')
- while new_performance > best_performance:
- # Score is better, so expand ensemble and replace new best score
+ elif method == 'Bagging':
if verbose:
- print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
- best_performance = new_performance
+ print('Creating ensemble using Caruana with Bagging method.')
+
+ nr_of_bagging_iterations = size
+ for bag in range(nr_of_bagging_iterations):
+ bag_ensemble = list()
+ subset_size = int(np.floor(n_classifiers / 2))
+ model_subset = random.sample(range(n_classifiers), subset_size)
+
+ best_ensemble_scores = list()
+ iteration = 0
+
+ while iteration < 20:
+ Y_valid_score = copy.deepcopy(base_Y_valid_score)
+ if verbose:
+ print(f"Iteration: {iteration}, best {scoring}: {new_performance}.")
+
+ if iteration > 1:
+ for num in range(0, n_iter):
+ y_score[num] = np.vstack((y_score[num], Y_valid_score[num][bag_ensemble[-1], :]))
+
+ elif iteration == 1:
+ # Create y_score object for second iteration
+ for num in range(0, n_iter):
+ y_score[num] = Y_valid_score[num][bag_ensemble[-1], :]
+
+ # Perform n-fold cross validation to estimate performance of each possible addition to ensemble
+ performances_temp = np.zeros((n_iter, subset_size))
+ for n_crossval in range(0, n_iter):
+ # For each estimator, add the score to the ensemble and new ensemble performance
+ estimator_counter = 0
+ for n_estimator in model_subset:
+ if iteration == 0:
+ # No y_score yet, so we need to build it instead of stacking
+ y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :]
+ else:
+ # Stack scores of added model on top of previous scores and average
+ y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0)
+
+ perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
+ performances_temp[n_crossval, estimator_counter] = perf
+ estimator_counter += 1
+
+ # Average performances over crossval
+ performances_temp = list(np.mean(performances_temp, axis=0))
+
+ # Check which ensemble should be in the ensemble to maximally improve
+ new_performance = max(performances_temp)
+ best_ensemble_scores.append(new_performance)
+ best_index = performances_temp.index(new_performance)
+ bag_ensemble.append(best_index)
+ iteration += 1
+
+ # Select the optimal ensemble size
+ optimal_ensemble_performance = max(best_ensemble_scores)
+ optimal_N_models = best_ensemble_scores.index(optimal_ensemble_performance) + 1
+ # Add the best ensemble of this bagging iteration to the final ensemble
+ bag_ensemble = bag_ensemble[0:optimal_N_models]
+ for model in bag_ensemble:
+ ensemble.append(model)
+ best_performance = optimal_ensemble_performance
- if iteration > 1:
- # Stack scores: not needed for first iteration
- ensemble.append(best_index)
- for num in range(0, n_iter):
- y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))
+ # Print the performance gain
+ print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
- elif iteration == 1:
- if not initialize:
- # Create y_score object for second iteration
- single_estimator_performance = new_performance
- ensemble.append(best_index)
- for num in range(0, n_iter):
- y_score[num] = Y_valid_score[num][ensemble[-1], :]
- else:
- # Stack scores: not needed when ensemble initialization is already used
- ensemble.append(best_index)
- for num in range(0, n_iter):
- y_score[num] = np.vstack((y_score[num], Y_valid_score[num][ensemble[-1], :]))
+ else:
+ print(f'[WORC WARNING] No valid ensemble method given: {method}. Not ensembling')
+ return self
- # Perform n-fold cross validation to estimate performance of each possible addition to ensemble
- performances_temp = np.zeros((n_iter, n_classifiers))
- for n_crossval in range(0, n_iter):
- # For each estimator, add the score to the ensemble and new ensemble performance
- for n_estimator in range(0, n_classifiers):
- if iteration == 0:
- # No y_score yet, so we need to build it instead of stacking
- y_valid_score_new = Y_valid_score[n_crossval][n_estimator, :]
- else:
- # Stack scores of added model on top of previous scores and average
- y_valid_score_new = np.mean(np.vstack((y_score[n_crossval], Y_valid_score[n_crossval][n_estimator, :])), axis=0)
+ # Create the ensemble --------------------------------------------------
- perf = compute_performance(scoring, Y_valid_truth[n_crossval], y_valid_score_new)
- performances_temp[n_crossval, n_estimator] = perf
+ # First create and score the ensemble on the validation set
+ # If we only want the best solution, we use the score from cv_results_
+ if method == 'Single':
+ self.ensemble_validation_score = self.cv_results_['mean_test_score'][0]
+ elif method == 'top_N':
+ self.ensemble_validation_score = [self.cv_results_['mean_test_score'][i] for i in ensemble]
+ else:
+ selected_params = [parameters_all[i] for i in ensemble]
+ val_split_scores = []
+ for train, valid in self.cv_iter:
+ estimators = list()
+ for enum, p_all in enumerate(selected_params):
+ new_estimator = clone(base_estimator)
+
+ new_estimator.refit_and_score(X_train, Y_train, p_all,
+ train, valid,
+ verbose=False)
- # Average performances over crossval
- performances_temp = list(np.mean(performances_temp, axis=0))
+ estimators.append(new_estimator)
- # Check which ensemble should be in the ensemble to maximally improve
- new_performance = max(performances_temp)
- best_index = performances_temp.index(new_performance)
- iteration += 1
+ new_estimator = clone(base_estimator)
+ new_estimator.ensemble = Ensemble(estimators)
+ new_estimator.best_estimator_ = new_estimator.ensemble
+ # Calculate and store the final performance of the ensemble
+ # on this validation split
+ X_train_values = np.asarray([x[0] for x in X_train])
+ predictions = new_estimator.predict(X_train_values[valid])
+ val_split_scores.append(compute_performance(scoring,
+ Y_train[valid],
+ predictions))
- # Print the performance gain
- print(f"Ensembling best {scoring}: {best_performance}.")
- print(f"Single estimator best {scoring}: {single_estimator_performance}.")
- print(f'Ensemble consists of {len(ensemble)} estimators {ensemble}.')
- else:
- print(f'[WORC WARNING] No valid ensemble method given: {method}. Not ensembling')
- return self
+ validation_score = np.mean(val_split_scores)
+ self.ensemble_validation_score = validation_score
+
+ print('Final ensemble validation score: ' + str(self.ensemble_validation_score))
# Create the ensemble --------------------------------------------------
train = np.arange(0, len(X_train))
@@ -1505,6 +1575,7 @@ Source code for WORC.classification.SearchCV
parameters_all[i],
train, train,
verbose=False)
+
estimators.append(estimator)
else:
# Create the ensemble trained on the full training set
@@ -1513,10 +1584,10 @@ Source code for WORC.classification.SearchCV
nest = len(ensemble)
for enum, p_all in enumerate(parameters_all):
# Refit a SearchCV object with the provided parameters
- print(f"Refitting estimator {enum+1} / {nest}.")
+ print(f"Refitting estimator {enum + 1} / {nest}.")
base_estimator = clone(base_estimator)
- # # Check if we need to create a multiclass estimator
+ # Check if we need to create a multiclass estimator
base_estimator.refit_and_score(X_train, Y_train, p_all,
train, train,
verbose=False)
@@ -1524,10 +1595,43 @@ Source code for WORC.classification.SearchCV
# Determine whether to overfit the feature scaling on the test set
base_estimator.overfit_scaler = overfit_scaler
- estimators.append(base_estimator)
+ try:
+ # Try a prediction to see if estimator is truly fitted
+ base_estimator.predict(np.asarray([X_train[0][0], X_train[1][0]]))
+ estimators.append(base_estimator)
+ except (NotFittedError, ValueError):
+ print(f'\t\t - Estimator {enum} could not be fitted (correctly), do not include in ensemble.')
+ if enum + 1 == nest and not estimators:
+                    print(f'\t\t - Reached end of ensemble ({enum + 1}), but ensemble is empty, thus go on until we find an estimator that works')
+ while not estimators:
+                            # We cannot have an empty ensemble, thus go on until we find an estimator that works
+ enum += 1
+ p_all = self.cv_results_['params'][enum]
+
+ # Refit a SearchCV object with the provided parameters
+ base_estimator = clone(base_estimator)
+
+ # Check if we need to create a multiclass estimator
+ base_estimator.refit_and_score(X_train, Y_train, p_all,
+ train, train,
+ verbose=False)
+
+ # Determine whether to overfit the feature scaling on the test set
+ base_estimator.overfit_scaler = overfit_scaler
+
+ try:
+ # Try a prediction to see if estimator is truly fitted
+ base_estimator.predict(np.asarray([X_train[0][0], X_train[1][0]]))
+ estimators.append(base_estimator)
+ except (NotFittedError, ValueError):
+ pass
+ print(f'\t\t - Needed estimator {enum}.')
+ else:
+ pass
self.ensemble = Ensemble(estimators)
self.best_estimator_ = self.ensemble
+
print("\n")
@@ -2885,6 +2989,462 @@ Source code for WORC.classification.SearchCV
train/test set.
"""
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
+
+
+[docs]class BaseSearchCVSMAC(BaseSearchCV):
+ """Base class for Bayesian hyper parameter search with cross-validation."""
+
+ def _fit(self, groups):
+ """Actual fitting, performing the search over parameters."""
+
+ regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']
+ isclassifier = \
+ not any(clf in regressors for clf in self.param_distributions['Classification']['classifiers'])
+
+ cv = check_cv(self.cv, self.labels, classifier=isclassifier)
+
+ self.features, self.labels, groups = indexable(self.features, self.labels, groups)
+ n_splits = cv.get_n_splits(self.features, self.labels, groups)
+
+ pre_dispatch = self.pre_dispatch
+ cv_iter = list(cv.split(self.features, self.labels, groups))
+
+ # Build the SMAC configuration
+ self.param_distributions['Other'] = dict()
+ self.param_distributions['Other']['random_seed'] = np.random.randint(1, 5000)
+ cs = build_smac_config(self.param_distributions)
+
+ # Run the optimization
+
+ # Here we will create and execute a fastr network
+
+ # Create temporary directory for fastr
+ if DebugDetector().do_detection():
+ # Specific name for easy debugging
+ debugnum = 0
+ name = 'DEBUG_' + str(debugnum)
+ tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
+ while os.path.exists(tempfolder):
+ debugnum += 1
+ name = 'DEBUG_' + str(debugnum)
+ tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
+
+ else:
+ name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))
+
+ tempfolder = os.path.join(fastr.config.mounts['tmp'], 'GS', name)
+ if not os.path.exists(tempfolder):
+ os.makedirs(tempfolder)
+
+ # Create the files containing the estimator and settings
+ estimator_labels = ['X', 'y', 'search_space', 'cv_iter', 'scoring',
+ 'verbose', 'fit_params', 'return_train_score',
+ 'return_n_test_samples',
+ 'return_times', 'return_parameters',
+ 'error_score', 'budget_type', 'budget',
+ 'init_method', 'init_budget', 'smac_result_file']
+
+ estimator_data = pd.Series([self.features, self.labels, cs,
+ cv_iter, self.scoring, False,
+ self.fit_params, self.return_train_score,
+ True, True, True,
+ self.error_score,
+ self.param_distributions['SMAC']['budget_type'],
+ self.param_distributions['SMAC']['budget'],
+ self.param_distributions['SMAC']['init_method'],
+ self.param_distributions['SMAC']['init_budget'],
+ self.smac_result_file],
+ index=estimator_labels,
+ name='estimator Data')
+
+ fname = 'estimatordata.hdf5'
+ estimatorname = os.path.join(tempfolder, fname)
+ estimator_data.to_hdf(estimatorname, 'Estimator Data')
+
+ estimatordata = f"vfs://tmp/GS/{name}/{fname}"
+
+ # Create the files containing the instance data
+ instance_labels = ['run_id', 'run_rng', 'run_name', 'tempfolder']
+ current_date_time = datetime.now()
+ random_id = random.randint(1000, 9999)
+ run_name = current_date_time.strftime('smac-run_' + '%m-%d_%H-%M-%S' + str(random_id))
+ instance_files = dict()
+ for i in range(self.param_distributions['SMAC']['n_smac_cores']):
+ instance_info = [i, random.randint(0, 2 ** 32 - 1), run_name, tempfolder]
+ instance_data = pd.Series(instance_info,
+ index=instance_labels,
+ name=f'instance data {i}')
+ fname = f'instancedata_{i}.hdf5'
+ instancefolder = os.path.join(tempfolder, 'instances', fname)
+ if not os.path.exists(os.path.dirname(instancefolder)):
+ os.makedirs(os.path.dirname(instancefolder))
+ instance_data.to_hdf(instancefolder, 'Instance Data')
+ instancedata = f'vfs://tmp/GS/{name}/instances/{fname}'
+ instance_files[f'{i}'] = instancedata
+
+ # Create the fastr network
+ network = fastr.create_network('WORC_SMAC_' + name)
+ estimator_data = network.create_source('HDF5', id='estimator_source')
+ instance_data = network.create_source('HDF5', id='instance_source')
+ sink_output = network.create_sink('HDF5', id='output')
+
+ smac_node = network.create_node('worc/smac:1.0', tool_version='1.0', id='smac',
+ resources=ResourceLimit(memory='5G'))
+
+ smac_node.inputs['estimatordata'] = estimator_data.output
+ smac_node.inputs['instancedata'] = instance_data.output
+ sink_output.input = smac_node.outputs['fittedestimator']
+
+ source_data = {'estimator_source': estimatordata,
+ 'instance_source': instance_files}
+
+ sink_data = {'output': f"vfs://tmp/GS/{name}/output_{{sample_id}}_{{cardinality}}{{ext}}"}
+
+ network.execute(source_data, sink_data,
+ tmpdir=os.path.join(tempfolder, 'tmp'),
+ execution_plugin=self.fastr_plugin)
+
+ # Check whether all jobs have finished
+ expected_no_files = len(instance_files)
+ sink_files = glob.glob(os.path.join(fastr.config.mounts['tmp'], 'GS', name) + '/output*.hdf5')
+ if len(sink_files) != expected_no_files:
+ difference = expected_no_files - len(sink_files)
+ fname = os.path.join(tempfolder, 'tmp')
+ message = ('Fitting classifiers has failed for ' +
+ f'{difference} / {expected_no_files} files. The temporary ' +
+                       f'results were not deleted and can be found in {tempfolder}. ' +
+ 'Probably your fitting and scoring failed: check out ' +
+ 'the tmp/smac folder within the tempfolder for ' +
+ 'the fastr job temporary results or run: fastr trace ' +
+ f'"{fname}{os.path.sep}__sink_data__.json" --samples.')
+ raise WORCexceptions.WORCValueError(message)
+
+ # Read in the output data once finished
+ save_data = list()
+ for output in sink_files:
+ data = pd.read_hdf(output)
+ save_data.extend(list(data['RET']))
+
+ # if one choose to see train score, "out" will contain train score info
+ if self.return_train_score:
+ (train_scores, test_scores, test_sample_counts,
+ fit_time, score_time, parameters_est, parameters_all) = \
+ zip(*save_data)
+ else:
+ (test_scores, test_sample_counts,
+ fit_time, score_time, parameters_est, parameters_all) = \
+ zip(*save_data)
+
+ # Process the smac_results data once finished
+ # First read in the results of all smac instance files
+ smac_filenames = glob.glob(os.path.join(tempfolder,
+ 'tested_configs',
+ run_name) + '/smac_stats_*.json')
+ # Then create a combined dictionary with all
+ # results of this cross-validation split and
+ # a summary
+ smac_results_for_this_cv = dict()
+ smac_results_for_this_cv[run_name] = dict()
+ summary = dict()
+ all_costs = []
+ best_cost = 1
+ all_runtimes = []
+ for fn in smac_filenames:
+ with open(fn, 'r') as f:
+ smac_result = json.load(f)
+ run_data = smac_result[list(smac_result.keys())[0]]
+ nr_of_inc_updates = run_data['inc_changed']
+ current_cost = run_data['inc_costs'][nr_of_inc_updates - 1]
+ all_costs.append(current_cost)
+ all_runtimes.append(run_data['wallclock_time_used'])
+ if current_cost < best_cost:
+ best_cost = current_cost
+ summary['best_score'] = current_cost
+ summary['best_inc_wallclock_time'] = run_data['inc_wallclock_times'][nr_of_inc_updates - 1]
+ summary['best_inc_evals'] = run_data['inc_evaluations'][nr_of_inc_updates - 1]
+ summary['best_inc_changed'] = run_data['inc_changed']
+ summary['best_config'] = run_data['inc_configs'][nr_of_inc_updates - 1]
+ smac_results_for_this_cv[run_name].update(smac_result)
+ summary['average_score'] = np.mean(all_costs)
+ summary['std_score'] = np.std(all_costs)
+ summary['shortest_runtime'] = np.min(all_runtimes)
+ summary['longest_runtime'] = np.max(all_runtimes)
+ summary['average_runtime'] = np.mean(all_runtimes)
+ summary['total_runtime'] = np.sum(all_runtimes)
+ final_summary = {'cv-summary': summary}
+ smac_results_for_this_cv[run_name].update(final_summary)
+
+ result_file = self.smac_result_file
+
+ if os.path.exists(result_file):
+ with open(result_file, 'r') as jsonfile:
+ results_so_far = json.load(jsonfile)
+ results_so_far.update(smac_results_for_this_cv)
+ with open(result_file, 'w') as jsonfile:
+ json.dump(results_so_far, jsonfile, indent=4)
+ else:
+ with open(result_file, 'a') as jsonfile:
+ json.dump(smac_results_for_this_cv, jsonfile, indent=4)
+
+ # Remove the temporary folder used
+ if name != 'DEBUG_0':
+ # Do delete if not debugging for first iteration
+ shutil.rmtree(tempfolder)
+
+ # Process the results of the fitting procedure
+ self.process_fit(n_splits=n_splits,
+ parameters_all=parameters_all,
+ test_sample_counts=test_sample_counts,
+ test_score_dicts=test_scores,
+ train_score_dicts=train_scores,
+ fit_time=fit_time,
+ score_time=score_time,
+ cv_iter=cv_iter,
+ X=self.features, y=self.labels,
+ use_smac=True)
+
+ return self
+
+
+[docs]class GuidedSearchCVSMAC(BaseSearchCVSMAC):
+ """Guided search on hyperparameters.
+
+ GuidedSearchCV implements a "fit" and a "score" method.
+ It also implements "predict", "predict_proba", "decision_function",
+ "transform" and "inverse_transform" if they are implemented in the
+ estimator used.
+
+ The parameters of the estimator used to apply these methods are optimized
+ by cross-validated search over parameter settings.
+
+ The optimization is performed using the Sequential Model-based Algorithm
+ Configuration (SMAC) method. A probabilistic model of the objective function
+ is constructed and updated with each function evaluation.
+
+ If all parameters are presented as a list,
+ sampling without replacement is performed. If at least one parameter
+ is given as a distribution, sampling with replacement is used.
+ It is highly recommended to use continuous distributions for continuous
+ parameters.
+
+ Parameters
+ ----------
+ param_distributions : dict
+ Dictionary with parameter names (string) as keys and details of their
+ domains as values. From this dictionary the complete search space
+ will later be constructed.
+
+ n_iter : int, default=10
+ Number of function evaluations allowed in each optimization sequence
+ of SMAC.
+
+ scoring : string, callable or None, default=None
+ A string (see model evaluation documentation) or
+ a scorer callable object / function with signature
+ ``scorer(estimator, X, y)``.
+ If ``None``, the ``score`` method of the estimator is used.
+
+ fit_params : dict, optional
+ Parameters to pass to the fit method.
+
+ n_jobs : int, default=1
+ Number of jobs to run in parallel.
+
+ pre_dispatch : int, or string, optional
+ Controls the number of jobs that get dispatched during parallel
+ execution. Reducing this number can be useful to avoid an
+ explosion of memory consumption when more jobs get dispatched
+ than CPUs can process. This parameter can be:
+
+ - None, in which case all the jobs are immediately
+ created and spawned. Use this for lightweight and
+ fast-running jobs, to avoid delays due to on-demand
+ spawning of the jobs
+
+ - An int, giving the exact number of total jobs that are
+ spawned
+
+ - A string, giving an expression as a function of n_jobs,
+ as in '2*n_jobs'
+
+ iid : boolean, default=True
+ If True, the data is assumed to be identically distributed across
+ the folds, and the loss minimized is the total loss per sample,
+ and not the mean loss across the folds.
+
+ cv : int, cross-validation generator or an iterable, optional
+ Determines the cross-validation splitting strategy.
+ Possible inputs for cv are:
+ - None, to use the default 3-fold cross validation,
+ - integer, to specify the number of folds in a `(Stratified)KFold`,
+ - An object to be used as a cross-validation generator.
+ - An iterable yielding train, test splits.
+
+ For integer/None inputs, if the estimator is a classifier and ``y`` is
+ either binary or multiclass, :class:`StratifiedKFold` is used. In all
+ other cases, :class:`KFold` is used.
+
+ Refer :ref:`User Guide <cross_validation>` for the various
+ cross-validation strategies that can be used here.
+
+ refit : boolean, default=True
+ Refit the best estimator with the entire dataset.
+ If "False", it is impossible to make predictions using
+ this RandomizedSearchCV instance after fitting.
+
+ verbose : integer
+ Controls the verbosity: the higher, the more messages.
+
+ random_state : int or RandomState
+ Pseudo random number generator state used for random uniform sampling
+ from lists of possible values instead of scipy.stats distributions.
+
+ error_score : 'raise' (default) or numeric
+ Value to assign to the score if an error occurs in estimator fitting.
+ If set to 'raise', the error is raised. If a numeric value is given,
+ FitFailedWarning is raised. This parameter does not affect the refit
+ step, which will always raise the error.
+
+ return_train_score : boolean, default=True
+ If ``'False'``, the ``cv_results_`` attribute will not include training
+ scores.
+
+ Attributes
+ ----------
+ cv_results_ : dict of numpy (masked) ndarrays
+ A dict with keys as column headers and values as columns, that can be
+ imported into a pandas ``DataFrame``.
+
+ For instance the below given table
+
+ +--------------+-------------+-------------------+---+---------------+
+ | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
+ +==============+=============+===================+===+===============+
+ | 'rbf' | 0.1 | 0.8 |...| 2 |
+ +--------------+-------------+-------------------+---+---------------+
+ | 'rbf' | 0.2 | 0.9 |...| 1 |
+ +--------------+-------------+-------------------+---+---------------+
+ | 'rbf' | 0.3 | 0.7 |...| 1 |
+ +--------------+-------------+-------------------+---+---------------+
+
+ will be represented by a ``cv_results_`` dict of::
+
+ {
+ 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
+ mask = False),
+ 'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False),
+ 'split0_test_score' : [0.8, 0.9, 0.7],
+ 'split1_test_score' : [0.82, 0.5, 0.7],
+ 'mean_test_score' : [0.81, 0.7, 0.7],
+ 'std_test_score' : [0.02, 0.2, 0.],
+ 'rank_test_score' : [3, 1, 1],
+ 'split0_train_score' : [0.8, 0.9, 0.7],
+ 'split1_train_score' : [0.82, 0.5, 0.7],
+ 'mean_train_score' : [0.81, 0.7, 0.7],
+ 'std_train_score' : [0.03, 0.03, 0.04],
+ 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49],
+ 'std_fit_time' : [0.01, 0.02, 0.01, 0.01],
+ 'mean_score_time' : [0.007, 0.06, 0.04, 0.04],
+ 'std_score_time' : [0.001, 0.002, 0.003, 0.005],
+ 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
+ }
+
+ NOTE that the key ``'params'`` is used to store a list of parameter
+ settings dict for all the parameter candidates.
+
+ The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
+ ``std_score_time`` are all in seconds.
+
+ best_estimator_ : estimator
+ Estimator that was chosen by the search, i.e. estimator
+ which gave highest score (or smallest loss if specified)
+ on the left out data. Not available if refit=False.
+
+ best_score_ : float
+ Score of best_estimator on the left out data.
+
+ best_params_ : dict
+ Parameter setting that gave the best results on the hold out data.
+
+ best_index_ : int
+ The index (of the ``cv_results_`` arrays) which corresponds to the best
+ candidate parameter setting.
+
+ The dict at ``search.cv_results_['params'][search.best_index_]`` gives
+ the parameter setting for the best model, that gives the highest
+ mean score (``search.best_score_``).
+
+ scorer_ : function
+ Scorer function used on the held out data to choose the best
+ parameters for the model.
+
+ n_splits_ : int
+ The number of cross-validation splits (folds/iterations).
+
+ Notes
+ -----
+ The parameters selected are those that maximize the score of the held-out
+ data, according to the scoring parameter.
+
+ If `n_jobs` was set to a value higher than one, the data is copied for each
+    parameter setting (and not `n_jobs` times). This is done for efficiency
+ reasons if individual jobs take very little time, but may raise errors if
+ the dataset is large and not enough memory is available. A workaround in
+ this case is to set `pre_dispatch`. Then, the memory is copied only
+ `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
+ n_jobs`.
+
+ See Also
+ --------
+ :class:`GridSearchCV`:
+ Does exhaustive search over a grid of parameters.
+
+ :class:`ParameterSampler`:
+ A generator over parameter settings, constructed from
+ param_distributions.
+
+ """
+
+[docs] def __init__(self, param_distributions={}, n_iter=10, scoring=None,
+ fit_params=None, n_jobs=1, iid=True, refit=True, cv=None,
+ verbose=0, pre_dispatch='2*n_jobs', random_state=None,
+ error_score='raise', return_train_score=True,
+ n_jobspercore=100, fastr_plugin=None, maxlen=100,
+ ranking_score='test_score', features=None, labels=None,
+ smac_result_file=None):
+ super(GuidedSearchCVSMAC, self).__init__(
+ param_distributions=param_distributions, scoring=scoring, fit_params=fit_params,
+ n_iter=n_iter, random_state=random_state, n_jobs=n_jobs, iid=iid, refit=refit, cv=cv, verbose=verbose,
+ pre_dispatch=pre_dispatch, error_score=error_score,
+ return_train_score=return_train_score,
+ n_jobspercore=n_jobspercore, fastr_plugin=fastr_plugin,
+ maxlen=maxlen, ranking_score=ranking_score)
+ self.features = features
+ self.labels = labels
+ self.smac_result_file = smac_result_file
+
+[docs] def fit(self, X, y=None, groups=None):
+ """Run fit on the estimator with randomly drawn parameters.
+
+ Parameters
+ ----------
+ X : array-like, shape = [n_samples, n_features]
+            Training vector, where n_samples is the number of samples and
+ n_features is the number of features.
+
+ y : array-like, shape = [n_samples] or [n_samples, n_output], optional
+ Target relative to X for classification or regression;
+ None for unsupervised learning.
+
+ groups : array-like, with shape (n_samples,), optional
+ Group labels for the samples used while splitting the dataset into
+ train/test set.
+ """
+ print("Fit: " + str(self.n_iter))
+ self.features = X
+ self.labels = y
+
+ return self._fit(groups)
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/construct_classifier.html b/WORC/doc/_build/html/_modules/WORC/classification/construct_classifier.html
index 141a37e3..32253edb 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/construct_classifier.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/construct_classifier.html
@@ -8,7 +8,7 @@
- WORC.classification.construct_classifier — WORC 3.5.0 documentation
+ WORC.classification.construct_classifier — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.classification.construct_classifier
#!/usr/bin/env python
-# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -195,6 +195,12 @@ Source code for WORC.classification.construct_classifier
import WORC.addexceptions as ae
from xgboost import XGBClassifier, XGBRegressor
+try:
+ from lightgbm import LGBMClassifier
+except:
+    print("[INFO] LightGBM classifier currently not available. Please see https://worc.readthedocs.io/en/latest/static/additionalfunctionality.html.")
+
+
[docs]def construct_classifier(config):
"""Interface to create classification.
@@ -270,6 +276,23 @@ Source code for WORC.classification.construct_classifier
colsample_bytree=colsample_bytree,
random_state=config['random_seed'])
+ elif config['classifiers'] == 'LightGBMClassifier':
+ # LightGBM Classifier
+ num_leaves = config['LightGBM_num_leaves']
+ max_depth = config['LightGBM_max_depth']
+ min_child_samples = config['LightGBM_min_child_samples']
+ reg_alpha = config['LightGBM_reg_alpha']
+ reg_lambda = config['LightGBM_reg_lambda']
+ min_child_weight = config['LightGBM_min_child_weight']
+
+ classifier = LGBMClassifier(num_leaves=num_leaves,
+ max_depth=max_depth,
+ min_child_samples=min_child_samples,
+ reg_alpha=reg_alpha,
+ reg_lambda=reg_lambda,
+ min_child_weight=min_child_weight,
+ random_state=config['random_seed'])
+
elif config['classifiers'] == 'RF':
# Random forest kernel
classifier = RandomForestClassifier(verbose=0,
@@ -390,13 +413,18 @@ Source code for WORC.classification.construct_classifier
clf = SVC(class_weight='balanced', probability=True, max_iter=max_iter,
random_state=config['random_seed'])
else:
+ # NOTE: SVMR has no random state
clf = SVMR(max_iter=max_iter)
clf.kernel = str(config['SVMKernel'])
clf.C = config['SVMC']
- clf.degree = config['SVMdegree']
- clf.coef0 = config['SVMcoef0']
- clf.gamma = config['SVMgamma']
+ # Only add the following parameters if they are defined
+ if 'SVMdegree' in config:
+ clf.degree = config['SVMdegree']
+ if 'SVMcoef0' in config:
+ clf.coef0 = config['SVMcoef0']
+ if 'SVMgamma' in config:
+ clf.gamma = config['SVMgamma']
return clf
@@ -508,6 +536,31 @@ Source code for WORC.classification.construct_classifier
scipy.stats.uniform(loc=config['XGB_colsample_bytree'][0],
scale=config['XGB_colsample_bytree'][1])
+ # LightGBM
+ param_grid['LightGBM_num_leaves'] =\
+ discrete_uniform(loc=config['LightGBM_num_leaves'][0],
+ scale=config['LightGBM_num_leaves'][1])
+
+ param_grid['LightGBM_max_depth'] =\
+ discrete_uniform(loc=config['LightGBM_max_depth'][0],
+ scale=config['LightGBM_max_depth'][1])
+
+ param_grid['LightGBM_min_child_samples'] =\
+ discrete_uniform(loc=config['LightGBM_min_child_samples'][0],
+ scale=config['LightGBM_min_child_samples'][1])
+
+ param_grid['LightGBM_reg_alpha'] =\
+ scipy.stats.uniform(loc=config['LightGBM_reg_alpha'][0],
+ scale=config['LightGBM_reg_alpha'][1])
+
+ param_grid['LightGBM_reg_lambda'] =\
+ scipy.stats.uniform(loc=config['LightGBM_reg_lambda'][0],
+ scale=config['LightGBM_reg_lambda'][1])
+
+ param_grid['LightGBM_min_child_weight'] =\
+ log_uniform(loc=config['LightGBM_min_child_weight'][0],
+ scale=config['LightGBM_min_child_weight'][1])
+
return param_grid
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/createfixedsplits.html b/WORC/doc/_build/html/_modules/WORC/classification/createfixedsplits.html
index e0d020d0..14a5ef20 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/createfixedsplits.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/createfixedsplits.html
@@ -8,7 +8,7 @@
- WORC.classification.createfixedsplits — WORC 3.5.0 documentation
+ WORC.classification.createfixedsplits — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/crossval.html b/WORC/doc/_build/html/_modules/WORC/classification/crossval.html
index 5212ca1b..a13806a8 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/crossval.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/crossval.html
@@ -8,7 +8,7 @@
- WORC.classification.crossval — WORC 3.5.0 documentation
+ WORC.classification.crossval — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -187,7 +187,7 @@ Source code for WORC.classification.crossval
import time
from time import gmtime, strftime
from sklearn.model_selection import train_test_split, LeaveOneOut
-from .parameter_optimization import random_search_parameters
+from .parameter_optimization import random_search_parameters, guided_search_parameters
import WORC.addexceptions as ae
from WORC.classification.regressors import regressors
import glob
@@ -205,7 +205,8 @@ Source code for WORC.classification.crossval
fixedsplits=None,
fixed_seed=False, use_fastr=None,
fastr_plugin=None,
- do_test_RS_Ensemble=False):
+ do_test_RS_Ensemble=False,
+ use_SMAC=False, smac_result_file=None):
"""Cross-validation in which data is randomly split in each iteration.
Due to options of doing single-label and multi-label classification,
@@ -229,6 +230,7 @@ Source code for WORC.classification.crossval
if fixedsplits is not None:
n_iterations = int(fixedsplits.columns.shape[0] / 2)
print(f'Fixedsplits detected, adjusting n_iterations to {n_iterations}')
+ logging.debug(f'Fixedsplits detected, adjusting n_iterations to {n_iterations}')
for i in range(start, n_iterations):
print(('Cross-validation iteration {} / {} .').format(str(i + 1), str(n_iterations)))
@@ -376,12 +378,21 @@ Source code for WORC.classification.crossval
config['HyperOptimization']['use_fastr'] = use_fastr
config['HyperOptimization']['fastr_plugin'] = fastr_plugin
n_cores = config['General']['Joblib_ncores']
- trained_classifier = random_search_parameters(features=X_train,
- labels=Y_train,
- param_grid=param_grid,
- n_cores=n_cores,
- random_seed=random_seed,
- **config['HyperOptimization'])
+ if use_SMAC:
+ trained_classifier = guided_search_parameters(features=X_train,
+ labels=Y_train,
+ parameters=config,
+ n_cores=n_cores,
+ random_seed=random_seed,
+ smac_result_file=smac_result_file,
+ **config['HyperOptimization'])
+ else:
+ trained_classifier = random_search_parameters(features=X_train,
+ labels=Y_train,
+ param_grid=param_grid,
+ n_cores=n_cores,
+ random_seed=random_seed,
+ **config['HyperOptimization'])
# We only want to save the feature values and one label array
X_train = [x[0] for x in X_train]
@@ -442,7 +453,8 @@ Source code for WORC.classification.crossval
modus, test_size, start=0, save_data=None,
tempsave=False, tempfolder=None, fixedsplits=None,
fixed_seed=False, use_fastr=None,
- fastr_plugin=None):
+ fastr_plugin=None,
+ use_SMAC=False, smac_result_file=None):
"""Cross-validation in which each sample is once used as the test set.
Mostly based on the default sklearn object.
@@ -511,12 +523,21 @@ Source code for WORC.classification.crossval
config['HyperOptimization']['use_fastr'] = use_fastr
config['HyperOptimization']['fastr_plugin'] = fastr_plugin
n_cores = config['General']['Joblib_ncores']
- trained_classifier = random_search_parameters(features=X_train,
- labels=Y_train,
- param_grid=param_grid,
- n_cores=n_cores,
- random_seed=random_seed,
- **config['HyperOptimization'])
+ if use_SMAC:
+ trained_classifier = guided_search_parameters(features=X_train,
+ labels=Y_train,
+ parameters=config,
+ n_cores=n_cores,
+ random_seed=random_seed,
+ smac_result_file=smac_result_file,
+ **config['HyperOptimization'])
+ else:
+ trained_classifier = random_search_parameters(features=X_train,
+ labels=Y_train,
+ param_grid=param_grid,
+ n_cores=n_cores,
+ random_seed=random_seed,
+ **config['HyperOptimization'])
# We only want to save the feature values and one label array
X_train = [x[0] for x in X_train]
@@ -563,7 +584,7 @@ Source code for WORC.classification.crossval
param_grid=None, use_fastr=False,
fastr_plugin=None, tempsave=False,
fixedsplits=None, ensemble={'Use': False}, outputfolder=None,
- modus='singlelabel'):
+ modus='singlelabel', use_SMAC=False, smac_result_file=None):
"""Constructs multiple individual classifiers based on the label settings.
Parameters
@@ -731,7 +752,9 @@ Source code for WORC.classification.crossval
fixedsplits=fixedsplits,
fixed_seed=fixed_seed,
use_fastr=use_fastr,
- fastr_plugin=fastr_plugin)
+ fastr_plugin=fastr_plugin,
+ use_SMAC=use_SMAC,
+ smac_result_file=smac_result_file)
elif crossval_type == 'LOO':
print('Performing leave-one-out cross-validations.')
logging.debug('Performing leave-one-out cross-validations.')
@@ -751,7 +774,9 @@ Source code for WORC.classification.crossval
fixedsplits=fixedsplits,
fixed_seed=fixed_seed,
use_fastr=use_fastr,
- fastr_plugin=fastr_plugin)
+ fastr_plugin=fastr_plugin,
+ use_SMAC=use_SMAC,
+ smac_result_file=smac_result_file)
else:
raise ae.WORCKeyError(f'{crossval_type} is not a recognized cross-validation type.')
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/estimators.html b/WORC/doc/_build/html/_modules/WORC/classification/estimators.html
index 9f233b4f..559879c5 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/estimators.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/estimators.html
@@ -8,7 +8,7 @@
- WORC.classification.estimators — WORC 3.5.0 documentation
+ WORC.classification.estimators — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/fitandscore.html b/WORC/doc/_build/html/_modules/WORC/classification/fitandscore.html
index 6b8a2046..77e5d3fb 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/fitandscore.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/fitandscore.html
@@ -8,7 +8,7 @@
- WORC.classification.fitandscore — WORC 3.5.0 documentation
+ WORC.classification.fitandscore — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.classification.fitandscore
#!/usr/bin/env python
-# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -201,14 +201,20 @@ Source code for WORC.classification.fitandscore
<
from WORC.featureprocessing.OneHotEncoderWrapper import OneHotEncoderWrapper
import WORC
import WORC.addexceptions as ae
+import time
# Specific imports for error management
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from numpy.linalg import LinAlgError
-# Suppress sklearn warnings
+# Suppress some sklearn warnings. These occur when unused hyperparameters are
+# supplied, when estimators that are refitted do not converge, or parts
+# are deprecated
import warnings
+from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=ConvergenceWarning)
[docs]def fit_and_score(X, y, scoring,
@@ -218,9 +224,9 @@ Source code for WORC.classification.fitandscore
<
return_n_test_samples=True,
return_times=True, return_parameters=False,
return_estimator=False,
- error_score='raise', verbose=True,
- return_all=True,
- refit_workflows=False):
+ error_score='raise', verbose=False,
+ return_all=True, refit_workflows=False,
+ use_smac=False):
"""Fit an estimator to a dataset and score the performance.
The following
@@ -453,6 +459,9 @@ Source code for WORC.classification.fitandscore
<
if not return_all:
del encoder
+ # Start the timing
+ start_time = time.time()
+
# ------------------------------------------------------------------------
# Feature imputation
if 'Imputation' in para_estimator.keys():
@@ -460,7 +469,11 @@ Source code for WORC.classification.fitandscore
<
imp_type = para_estimator['ImputationMethod']
if verbose:
print(f'Imputing NaN with {imp_type}.')
- imp_nn = para_estimator['ImputationNeighbours']
+ # Only used with KNN in SMAC, otherwise assign default
+ if 'ImputationNeighbours' in para_estimator.keys():
+ imp_nn = para_estimator['ImputationNeighbours']
+ else:
+ imp_nn = 8
imputer = Imputer(missing_values=np.nan, strategy=imp_type,
n_neighbors=imp_nn)
@@ -477,7 +490,8 @@ Source code for WORC.classification.fitandscore
<
del para_estimator['Imputation']
del para_estimator['ImputationMethod']
- del para_estimator['ImputationNeighbours']
+ if 'ImputationNeighbours' in para_estimator.keys():
+ del para_estimator['ImputationNeighbours']
# Delete the object if we do not need to return it
if not return_all:
@@ -566,6 +580,14 @@ Source code for WORC.classification.fitandscore
<
# Delete the non-used fields
para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -602,6 +624,14 @@ Source code for WORC.classification.fitandscore
<
print(parameters)
para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -639,9 +669,8 @@ Source code for WORC.classification.fitandscore
<
if not return_all:
del scaler
-
# --------------------------------------------------------------------
- # Relief feature selection, possibly multi classself.
+ # Relief feature selection, possibly multi class.
# Needs to be done after scaling!
# para_estimator['ReliefUse'] = 'True'
if 'ReliefUse' in para_estimator.keys():
@@ -691,6 +720,14 @@ Source code for WORC.classification.fitandscore
<
print(parameters)
para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -698,61 +735,63 @@ Source code for WORC.classification.fitandscore
<
# ------------------------------------------------------------------------
# Perform feature selection using a model
- para_estimator['SelectFromModel'] = 'True'
- if 'SelectFromModel' in para_estimator.keys() and para_estimator['SelectFromModel'] == 'True':
- model = para_estimator['SelectFromModel_estimator']
- if verbose:
- print(f"Selecting features using model {model}.")
-
- if model == 'Lasso':
- # Use lasso model for feature selection
- alpha = para_estimator['SelectFromModel_lasso_alpha']
- selectestimator = Lasso(alpha=alpha, random_state=random_seed)
-
- elif model == 'LR':
- # Use logistic regression model for feature selection
- selectestimator = LogisticRegression(random_state=random_seed)
-
- elif model == 'RF':
- # Use random forest model for feature selection
- n_estimators = para_estimator['SelectFromModel_n_trees']
- selectestimator = RandomForestClassifier(n_estimators=n_estimators,
- random_state=random_seed)
- else:
- raise ae.WORCKeyError(f'Model {model} is not known for SelectFromModel. Use Lasso, LR, or RF.')
-
- if len(y_train.shape) >= 2:
- # Multilabel or regression. Regression: second dimension has length 1
- if y_train.shape[1] > 1 and model != 'RF':
- raise ae.WORCValueError(f'Model {model} is not suitable for multiclass classification. Please use RF or do not use SelectFromModel.')
+ if 'SelectFromModel' in para_estimator.keys():
+ if para_estimator['SelectFromModel'] == 'True':
+ model = para_estimator['SelectFromModel_estimator']
+ if verbose:
+ print(f"Selecting features using model {model}.")
+
+ if model == 'Lasso':
+ # Use lasso model for feature selection
+ alpha = para_estimator['SelectFromModel_lasso_alpha']
+ selectestimator = Lasso(alpha=alpha, random_state=random_seed)
+
+ elif model == 'LR':
+ # Use logistic regression model for feature selection
+ selectestimator = LogisticRegression(random_state=random_seed)
+
+ elif model == 'RF':
+ # Use random forest model for feature selection
+ n_estimators = para_estimator['SelectFromModel_n_trees']
+ selectestimator = RandomForestClassifier(n_estimators=n_estimators,
+ random_state=random_seed)
+ else:
+ raise ae.WORCKeyError(f'Model {model} is not known for SelectFromModel. Use Lasso, LR, or RF.')
- # Prefit model
- selectestimator.fit(X_train, y_train)
+ if len(y_train.shape) >= 2:
+ # Multilabel or regression. Regression: second dimension has length 1
+ if y_train.shape[1] > 1 and model != 'RF':
+ raise ae.WORCValueError(f'Model {model} is not suitable for multiclass classification. Please use RF or do not use SelectFromModel.')
- # Use fit to select optimal features
- SelectModel = SelectFromModel(selectestimator, prefit=True)
- if verbose:
- print("\t Original Length: " + str(len(X_train[0])))
+ # Prefit model
+ selectestimator.fit(X_train, y_train)
- X_train_temp = SelectModel.transform(X_train)
- if len(X_train_temp[0]) == 0:
+ # Use fit to select optimal features
+ SelectModel = SelectFromModel(selectestimator, prefit=True)
if verbose:
- print('[WORC WARNING]: No features are selected! Probably your data is too noisy or the selection too strict. Skipping SelectFromModel.')
- SelectModel = None
- parameters['SelectFromModel'] = 'False'
- else:
- X_train = SelectModel.transform(X_train)
- X_test = SelectModel.transform(X_test)
- feature_labels = SelectModel.transform(feature_labels)
+ print("\t Original Length: " + str(len(X_train[0])))
- if verbose:
- print("\t New Length: " + str(len(X_train[0])))
+ X_train_temp = SelectModel.transform(X_train)
+ if len(X_train_temp[0]) == 0:
+ if verbose:
+ print('[WORC WARNING]: No features are selected! Probably your data is too noisy or the selection too strict. Skipping SelectFromModel.')
+ SelectModel = None
+ parameters['SelectFromModel'] = 'False'
+ else:
+ X_train = SelectModel.transform(X_train)
+ X_test = SelectModel.transform(X_test)
+ feature_labels = SelectModel.transform(feature_labels)
+
+ if verbose:
+ print("\t New Length: " + str(len(X_train[0])))
- if 'SelectFromModel' in para_estimator.keys():
del para_estimator['SelectFromModel']
- del para_estimator['SelectFromModel_lasso_alpha']
- del para_estimator['SelectFromModel_estimator']
- del para_estimator['SelectFromModel_n_trees']
+ if 'SelectFromModel_lasso_alpha' in para_estimator.keys():
+ del para_estimator['SelectFromModel_lasso_alpha']
+ if 'SelectFromModel_estimator' in para_estimator.keys():
+ del para_estimator['SelectFromModel_estimator']
+ if 'SelectFromModel_n_trees' in para_estimator.keys():
+ del para_estimator['SelectFromModel_n_trees']
# Delete the object if we do not need to return it
if not return_all:
@@ -766,6 +805,14 @@ Source code for WORC.classification.fitandscore
<
print(parameters)
para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -788,6 +835,15 @@ Source code for WORC.classification.fitandscore
<
print(f'[WARNING]: skipping this setting due to PCA Error: {e}.')
pca = None
+
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -809,6 +865,15 @@ Source code for WORC.classification.fitandscore
<
print(f'[WARNING]: skipping this setting due to PCA Error: {e}.')
pca = None
+
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -851,7 +916,8 @@ Source code for WORC.classification.fitandscore
<
if 'UsePCA' in para_estimator.keys():
del para_estimator['UsePCA']
- del para_estimator['PCAType']
+ if 'PCAType' in para_estimator.keys():
+ del para_estimator['PCAType']
# --------------------------------------------------------------------
# Feature selection based on a statistical test
@@ -871,8 +937,21 @@ Source code for WORC.classification.fitandscore
<
if len(X_train_temp[0]) == 0:
if verbose:
print('[WORC WARNING]: No features are selected! Probably your statistical test feature selection was too strict. Skipping thresholding.')
- StatisticalSel = None
- parameters['StatisticalTestUse'] = 'False'
+ para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+ if return_all:
+ return ret, GroupSel, VarSel, SelectModel,\
+ feature_labels[0], scaler, encoder, imputer, pca,\
+ StatisticalSel, ReliefSel, Sampler
+ else:
+ return ret
+
else:
X_train = StatisticalSel.transform(X_train)
X_test = StatisticalSel.transform(X_test)
@@ -881,9 +960,13 @@ Source code for WORC.classification.fitandscore
<
if verbose:
print("\t New Length: " + str(len(X_train[0])))
+ # Delete the statistical test keys
del para_estimator['StatisticalTestUse']
- del para_estimator['StatisticalTestMetric']
- del para_estimator['StatisticalTestThreshold']
+ if 'StatisticalTestMetric' in para_estimator.keys():
+ del para_estimator['StatisticalTestMetric']
+
+ if 'StatisticalTestThreshold' in para_estimator.keys():
+ del para_estimator['StatisticalTestThreshold']
# Delete the object if we do not need to return it
if not return_all:
@@ -899,8 +982,21 @@ Source code for WORC.classification.fitandscore
<
neg_initial = int(len(y_train) - pos_initial)
len_in = len(y_train)
+        # If SMAC has removed a certain parameter, add a dummy although
+ # it's not actually used
+ if 'Resampling_sampling_strategy' not in para_estimator.keys():
+ para_estimator['Resampling_sampling_strategy'] = None
+
+ if 'Resampling_n_neighbors' not in para_estimator.keys():
+ para_estimator['Resampling_n_neighbors'] = None
+
+ if 'Resampling_k_neighbors' not in para_estimator.keys():
+ para_estimator['Resampling_k_neighbors'] = None
+
+ if 'Resampling_threshold_cleaning' not in para_estimator.keys():
+ para_estimator['Resampling_threshold_cleaning'] = None
+
# Fit ObjectSampler and transform dataset
- # NOTE: need to save random state for this one as well!
Sampler =\
ObjectSampler(method=para_estimator['Resampling_Method'],
sampling_strategy=para_estimator['Resampling_sampling_strategy'],
@@ -930,6 +1026,14 @@ Source code for WORC.classification.fitandscore
<
print(parameters)
para_estimator = delete_nonestimator_parameters(para_estimator)
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -960,13 +1064,25 @@ Source code for WORC.classification.fitandscore
<
train = np.arange(0, len(y_train))
test = np.arange(len(y_train), len(y_train) + len(y_test))
+ # Delete the resampling parameters
del para_estimator['Resampling_Use']
- del para_estimator['Resampling_Method']
- del para_estimator['Resampling_sampling_strategy']
- del para_estimator['Resampling_n_neighbors']
- del para_estimator['Resampling_k_neighbors']
- del para_estimator['Resampling_threshold_cleaning']
- del para_estimator['Resampling_n_cores']
+ if 'Resampling_Method' in para_estimator.keys():
+ del para_estimator['Resampling_Method']
+
+ if 'Resampling_sampling_strategy' in para_estimator.keys():
+ del para_estimator['Resampling_sampling_strategy']
+
+ if 'Resampling_n_neighbors' in para_estimator.keys():
+ del para_estimator['Resampling_n_neighbors']
+
+ if 'Resampling_k_neighbors' in para_estimator.keys():
+ del para_estimator['Resampling_k_neighbors']
+
+ if 'Resampling_threshold_cleaning' in para_estimator.keys():
+ del para_estimator['Resampling_threshold_cleaning']
+
+ if 'Resampling_n_cores' in para_estimator.keys():
+ del para_estimator['Resampling_n_cores']
# Delete the object if we do not need to return it
if not return_all:
@@ -1020,6 +1136,14 @@ Source code for WORC.classification.fitandscore
<
if verbose:
print(f'[WARNING]: skipping this setting due to LDA Error: {e}.')
+ # Update the runtime
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -1037,6 +1161,14 @@ Source code for WORC.classification.fitandscore
<
train=indices, test=indices)
ret.append(estimator)
+ # End the timing and store the fit_time
+ end_time = time.time()
+ runtime = end_time - start_time
+ if return_train_score:
+ ret[3] = runtime
+ else:
+ ret[2] = runtime
+
if return_all:
return ret, GroupSel, VarSel, SelectModel, feature_labels[0], scaler, encoder, imputer, pca, StatisticalSel, ReliefSel, Sampler
else:
@@ -1049,60 +1181,41 @@ Source code for WORC.classification.fitandscore
<
Delete all parameters in a parameter dictionary that are not used for the
actual estimator.
"""
- if 'Number' in parameters.keys():
- del parameters['Number']
-
- if 'UsePCA' in parameters.keys():
- del parameters['UsePCA']
- del parameters['PCAType']
-
- if 'ReliefUse' in parameters.keys():
- del parameters['ReliefUse']
- del parameters['ReliefNN']
- del parameters['ReliefSampleSize']
- del parameters['ReliefDistanceP']
- del parameters['ReliefNumFeatures']
-
- if 'OneHotEncoding' in parameters.keys():
- del parameters['OneHotEncoding']
- del parameters['OneHotEncoding_feature_labels_tofit']
-
- if 'Imputation' in parameters.keys():
- del parameters['Imputation']
- del parameters['ImputationMethod']
- del parameters['ImputationNeighbours']
-
- if 'SelectFromModel' in parameters.keys():
- del parameters['SelectFromModel']
- del parameters['SelectFromModel_lasso_alpha']
- del parameters['SelectFromModel_estimator']
- del parameters['SelectFromModel_n_trees']
-
- if 'Featsel_Variance' in parameters.keys():
- del parameters['Featsel_Variance']
-
- if 'FeatPreProcess' in parameters.keys():
- del parameters['FeatPreProcess']
-
- if 'FeatureScaling' in parameters.keys():
- del parameters['FeatureScaling']
-
- if 'StatisticalTestUse' in parameters.keys():
- del parameters['StatisticalTestUse']
- del parameters['StatisticalTestMetric']
- del parameters['StatisticalTestThreshold']
-
- if 'Resampling_Use' in parameters.keys():
- del parameters['Resampling_Use']
- del parameters['Resampling_Method']
- del parameters['Resampling_sampling_strategy']
- del parameters['Resampling_n_neighbors']
- del parameters['Resampling_k_neighbors']
- del parameters['Resampling_threshold_cleaning']
- del parameters['Resampling_n_cores']
-
- if 'random_seed' in parameters.keys():
- del parameters['random_seed']
+ deletekeys = ['Number',
+ 'UsePCA',
+ 'PCAType',
+ 'ReliefUse',
+ 'ReliefNN',
+ 'ReliefSampleSize',
+ 'ReliefNumFeatures',
+ 'OneHotEncoding',
+ 'OneHotEncoding_feature_labels_tofit',
+ 'Imputation',
+ 'ImputationMethod',
+ 'ImputationNeighbours',
+ 'SelectFromModel',
+ 'SelectFromModel_lasso_alpha',
+ 'SelectFromModel_estimator',
+ 'SelectFromModel_n_trees',
+ 'Featsel_Variance',
+ 'FeatPreProcess',
+ 'FeatureScaling',
+ 'StatisticalTestUse',
+ 'StatisticalTestMetric',
+ 'StatisticalTestThreshold',
+ 'Resampling_Use',
+ 'Resampling_Method',
+ 'Resampling_sampling_strategy',
+ 'Resampling_n_cores',
+ 'Resampling_n_neighbors',
+ 'Resampling_k_neighbors',
+ 'Resampling_threshold_cleaning',
+ 'random_seed'
+ ]
+
+ for k in deletekeys:
+ if k in parameters.keys():
+ del parameters[k]
return parameters
@@ -1159,7 +1272,13 @@ Source code for WORC.classification.fitandscore
<
'XGB_learning_rate',
'XGB_gamma',
'XGB_min_child_weight',
- 'XGB_colsample_bytree']
+ 'XGB_colsample_bytree',
+ 'LightGBM_num_leaves',
+ 'LightGBM_max_depth',
+ 'LightGBM_min_child_samples',
+ 'LightGBM_reg_alpha',
+ 'LightGBM_reg_lambda',
+ 'LightGBM_min_child_weight']
for k in deletekeys:
if k in para.keys():
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/metrics.html b/WORC/doc/_build/html/_modules/WORC/classification/metrics.html
index 187375f8..3a6c66d0 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/metrics.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/metrics.html
@@ -8,7 +8,7 @@
- WORC.classification.metrics — WORC 3.5.0 documentation
+ WORC.classification.metrics — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/parameter_optimization.html b/WORC/doc/_build/html/_modules/WORC/classification/parameter_optimization.html
index 7ca0ce99..1315e80f 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/parameter_optimization.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/parameter_optimization.html
@@ -8,7 +8,7 @@
- WORC.classification.parameter_optimization — WORC 3.5.0 documentation
+ WORC.classification.parameter_optimization — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -183,15 +183,15 @@ Source code for WORC.classification.parameter_optimization
import numpy as np
from sklearn.utils import check_random_state
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
-from WORC.classification.SearchCV import RandomizedSearchCVfastr, RandomizedSearchCVJoblib
+from WORC.classification.SearchCV import RandomizedSearchCVfastr, RandomizedSearchCVJoblib, GuidedSearchCVSMAC
[docs]def random_search_parameters(features, labels, N_iter, test_size,
param_grid, scoring_method, n_splits=5,
n_jobspercore=200, use_fastr=False,
- n_cores=1, fastr_plugin=None, memory='2G',
- maxlen=100,
- ranking_score='test_score', random_seed=None,
+ n_cores=1, fastr_plugin=None,
+ memory='2G', maxlen=100, ranking_score='test_score',
+ random_seed=None,
refit_workflows=False):
"""
Train a classifier and simultaneously optimizes hyperparameters using a
@@ -220,7 +220,9 @@ Source code for WORC.classification.parameter_optimization
random_search: sklearn randomsearch object containing the results.
"""
if random_seed is None:
- random_seed = np.random.randint(1, 5000)
+ #random_seed = np.random.randint(1, 5000)
+ # Fix the random seed for testing
+ random_seed = 42
random_state = check_random_state(random_seed)
regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']
@@ -263,6 +265,77 @@ Source code for WORC.classification.parameter_optimization
print(f"\n Best score using best parameters: {scoring_method} = {random_search.best_score_}")
return random_search
+
+
+[docs]def guided_search_parameters(features, labels, N_iter, test_size,
+ parameters, scoring_method, n_splits=5,
+ n_jobspercore=200, use_fastr=False,
+ n_cores=1, fastr_plugin=None,
+ memory='2G', maxlen=100, ranking_score='test_score',
+ random_seed=None, refit_workflows=False,
+ smac_result_file=None):
+ """
+    Train a classifier and simultaneously optimize hyperparameters using a
+ Bayesian optimization approach.
+
+ Arguments:
+ features: numpy array containing the training features.
+ labels: list containing the object labels to be trained on.
+ N_iter: integer listing the number of iterations to be used in the
+ hyperparameter optimization.
+ test_size: float listing the test size percentage used in the cross
+ validation.
+ classifier: sklearn classifier to be tested
+ param_grid: dictionary containing all possible hyperparameters and their
+            values or distributions.
+ scoring_method: string defining scoring method used in optimization,
+ e.g. f1_weighted for a SVM.
+        n_jobspercore: integer listing the number of jobs that are run on a
+            single core when using the fastr randomized search.
+        use_fastr: Boolean determining whether fastr or joblib should be used
+            for the optimization.
+ fastr_plugin: determines which plugin is used for fastr executions.
+ When None, uses the default plugin from the fastr config.
+
+ Returns:
+ guided_search: object containing the results
+ """
+ if random_seed is None:
+ #random_seed = np.random.randint(1, 5000)
+ # Fix the random seed for testing
+ random_seed = 42
+ random_state = check_random_state(random_seed)
+
+ regressors = ['SVR', 'RFR', 'SGDR', 'Lasso', 'ElasticNet']
+ if any(clf in regressors for clf in parameters['Classification']['classifiers']):
+ # We cannot do a stratified shuffle split with regression
+ cv = ShuffleSplit(n_splits=n_splits, test_size=test_size,
+ random_state=random_state)
+ else:
+ cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size,
+ random_state=random_state)
+
+ guided_search = GuidedSearchCVSMAC(param_distributions=parameters,
+ n_iter=N_iter,
+ scoring=scoring_method,
+ n_jobs=n_cores,
+ n_jobspercore=n_jobspercore,
+ maxlen=maxlen,
+ verbose=1, cv=cv,
+ fastr_plugin=fastr_plugin,
+ ranking_score=ranking_score,
+ features=features,
+ labels=labels,
+ smac_result_file=smac_result_file)
+
+ guided_search.fit(features, labels)
+ print("Best found parameters:")
+ for i in guided_search.best_params_:
+ print(f'{i}: {guided_search.best_params_[i]}.')
+ print("\n Best score using best parameters:")
+ print(guided_search.best_score_)
+
+ return guided_search
diff --git a/WORC/doc/_build/html/_modules/WORC/classification/trainclassifier.html b/WORC/doc/_build/html/_modules/WORC/classification/trainclassifier.html
index 9942bfa6..8f6ae959 100644
--- a/WORC/doc/_build/html/_modules/WORC/classification/trainclassifier.html
+++ b/WORC/doc/_build/html/_modules/WORC/classification/trainclassifier.html
@@ -8,7 +8,7 @@
- WORC.classification.trainclassifier — WORC 3.5.0 documentation
+ WORC.classification.trainclassifier — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.classification.trainclassifier
#!/usr/bin/env python
-# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2022 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -181,6 +181,7 @@ Source code for WORC.classification.trainclassifier
# limitations under the License.
import os
+import numpy as np
from scipy.stats import uniform
from WORC.classification import crossval as cv
from WORC.classification import construct_classifier as cc
@@ -188,12 +189,13 @@ Source code for WORC.classification.trainclassifier
import WORC.IOparser.config_io_classifier as config_io
from WORC.classification.AdvancedSampler import discrete_uniform, \
log_uniform, boolean_uniform
+import json
[docs]def trainclassifier(feat_train, patientinfo_train, config,
output_hdf,
feat_test=None, patientinfo_test=None,
- fixedsplits=None, verbose=True):
+ fixedsplits=None, output_smac=None, verbose=True):
"""Train a classifier using machine learning from features.
By default, if no
@@ -270,6 +272,14 @@ Source code for WORC.classification.trainclassifier
if type(fixedsplits) is list:
fixedsplits = ''.join(fixedsplits)
+ if type(output_smac) is list:
+ if len(output_smac) == 1:
+ output_smac = ''.join(output_smac)
+ else:
+ # FIXME
+ print('[WORC Warning] You provided multiple output json files: only the first one will be used!')
+ output_smac = output_smac[0]
+
# Load variables from the config file
config = config_io.load_config(config)
label_type = config['Labels']['label_names']
@@ -306,6 +316,7 @@ Source code for WORC.classification.trainclassifier
# For N_iter, perform k-fold crossvalidation
outputfolder = os.path.dirname(output_hdf)
+ smac_result_file = output_smac
if feat_test is None:
trained_classifier = cv.crossval(config, label_data_train,
image_features_train,
@@ -316,7 +327,9 @@ Source code for WORC.classification.trainclassifier
fixedsplits=fixedsplits,
ensemble=config['Ensemble'],
outputfolder=outputfolder,
- tempsave=config['General']['tempsave'])
+ tempsave=config['General']['tempsave'],
+ use_SMAC=config['SMAC']['use'],
+ smac_result_file=smac_result_file)
else:
trained_classifier = cv.nocrossval(config, label_data_train,
label_data_test,
diff --git a/WORC/doc/_build/html/_modules/WORC/detectors/detectors.html b/WORC/doc/_build/html/_modules/WORC/detectors/detectors.html
index e1f7f1e6..938d1dd6 100644
--- a/WORC/doc/_build/html/_modules/WORC/detectors/detectors.html
+++ b/WORC/doc/_build/html/_modules/WORC/detectors/detectors.html
@@ -8,7 +8,7 @@
- WORC.detectors.detectors — WORC 3.5.0 documentation
+ WORC.detectors.detectors — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -229,11 +229,11 @@ Source code for WORC.detectors.detectors
return False
-[docs]class CartesiusClusterDetector(AbstractDetector):
+[docs]class SnelliusClusterDetector(AbstractDetector):
def _is_detected(self):
if LinuxDetector()._is_detected():
try:
- if 'cartesius' in Path('/etc/hosts').read_text():
+ if 'localhost6.localdomain6' in Path('/etc/hosts').read_text():
return True
except:
return False
diff --git a/WORC/doc/_build/html/_modules/WORC/exampledata/datadownloader.html b/WORC/doc/_build/html/_modules/WORC/exampledata/datadownloader.html
index 58bb52ed..08383a48 100644
--- a/WORC/doc/_build/html/_modules/WORC/exampledata/datadownloader.html
+++ b/WORC/doc/_build/html/_modules/WORC/exampledata/datadownloader.html
@@ -8,7 +8,7 @@
- WORC.exampledata.datadownloader — WORC 3.5.0 documentation
+ WORC.exampledata.datadownloader — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/Imputer.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/Imputer.html
index 79e9df03..1945c0da 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/Imputer.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/Imputer.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.Imputer — WORC 3.5.0 documentation
+ WORC.featureprocessing.Imputer — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/Relief.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/Relief.html
index 474d322b..2d98c9a6 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/Relief.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/Relief.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.Relief — WORC 3.5.0 documentation
+ WORC.featureprocessing.Relief — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectGroups.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectGroups.html
index 21fd4d03..ac0b0219 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectGroups.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectGroups.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.SelectGroups — WORC 3.5.0 documentation
+ WORC.featureprocessing.SelectGroups — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectIndividuals.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectIndividuals.html
index aa873206..9191915d 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectIndividuals.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/SelectIndividuals.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.SelectIndividuals — WORC 3.5.0 documentation
+ WORC.featureprocessing.SelectIndividuals — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestFeatures.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestFeatures.html
index a5797ff2..da8876cd 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestFeatures.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestFeatures.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.StatisticalTestFeatures — WORC 3.5.0 documentation
+ WORC.featureprocessing.StatisticalTestFeatures — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestThreshold.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestThreshold.html
index 2be7d257..0e74c9c1 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestThreshold.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/StatisticalTestThreshold.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.StatisticalTestThreshold — WORC 3.5.0 documentation
+ WORC.featureprocessing.StatisticalTestThreshold — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/featureprocessing/VarianceThreshold.html b/WORC/doc/_build/html/_modules/WORC/featureprocessing/VarianceThreshold.html
index 133a13ef..752b689d 100644
--- a/WORC/doc/_build/html/_modules/WORC/featureprocessing/VarianceThreshold.html
+++ b/WORC/doc/_build/html/_modules/WORC/featureprocessing/VarianceThreshold.html
@@ -8,7 +8,7 @@
- WORC.featureprocessing.VarianceThreshold — WORC 3.5.0 documentation
+ WORC.featureprocessing.VarianceThreshold — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/compute_CI.html b/WORC/doc/_build/html/_modules/WORC/plotting/compute_CI.html
index 22dbb49e..7cc98b22 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/compute_CI.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/compute_CI.html
@@ -8,7 +8,7 @@
- WORC.plotting.compute_CI — WORC 3.5.0 documentation
+ WORC.plotting.compute_CI — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/linstretch.html b/WORC/doc/_build/html/_modules/WORC/plotting/linstretch.html
index 68675c31..b5f29baa 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/linstretch.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/linstretch.html
@@ -8,7 +8,7 @@
- WORC.plotting.linstretch — WORC 3.5.0 documentation
+ WORC.plotting.linstretch — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/plot_ROC.html b/WORC/doc/_build/html/_modules/WORC/plotting/plot_ROC.html
index fa6d0c04..fc4bb635 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/plot_ROC.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/plot_ROC.html
@@ -8,7 +8,7 @@
- WORC.plotting.plot_ROC — WORC 3.5.0 documentation
+ WORC.plotting.plot_ROC — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -699,8 +699,11 @@ Source code for WORC.plotting.plot_ROC
parser.add_argument('-pinfo', '--pinfo', metavar='pinfo',
nargs='+', dest='pinfo', type=str, required=True,
help='Patient Info File (txt)')
- parser.add_argument('-ensemble', '--ensemble', metavar='ensemble',
- nargs='+', dest='ensemble', type=str, required=True,
+ parser.add_argument('-ensemble_method', '--ensemble_method', metavar='ensemble_method',
+ nargs='+', dest='ensemble_method', type=str, required=True,
+ help='Method for creating ensemble (string)')
+ parser.add_argument('-ensemble_size', '--ensemble_size', metavar='ensemble_size',
+ nargs='+', dest='ensemble_size', type=str, required=False,
help='Length of ensemble (int)')
parser.add_argument('-label_type', '--label_type', metavar='label_type',
nargs='+', dest='label_type', type=str, required=True,
@@ -727,7 +730,8 @@ Source code for WORC.plotting.plot_ROC
plot_ROC(prediction=args.prediction,
pinfo=args.pinfo,
- ensemble=args.ensemble,
+ ensemble_method=args.ensemble_method,
+ ensemble_size=args.ensemble_size,
label_type=args.label_type,
ROC_png=args.ROC_png,
ROC_tex=args.ROC_tex,
@@ -737,7 +741,8 @@ Source code for WORC.plotting.plot_ROC
PRC_csv=args.PRC_csv)
-[docs]def plot_ROC(prediction, pinfo, ensemble=1, label_type=None,
+[docs]def plot_ROC(prediction, pinfo, ensemble_method='top_N',
+ ensemble_size=1, label_type=None,
ROC_png=None, ROC_tex=None, ROC_csv=None,
PRC_png=None, PRC_tex=None, PRC_csv=None):
# Convert the inputs to the correct format
@@ -747,8 +752,11 @@ Source code for WORC.plotting.plot_ROC
if type(pinfo) is list:
pinfo = ''.join(pinfo)
- if type(ensemble) is list:
- ensemble = int(ensemble[0])
+ if type(ensemble_method) is list:
+ ensemble_method = ''.join(ensemble_method)
+
+ if type(ensemble_size) is list:
+ ensemble_size = int(ensemble_size[0])
if type(ROC_png) is list:
ROC_png = ''.join(ROC_png)
@@ -787,7 +795,8 @@ Source code for WORC.plotting.plot_ROC
print('Determining score per patient.')
y_truths, y_scores, _, _ =\
plot_estimator_performance(prediction, pinfo, [label_type],
- alpha=0.95, ensemble=ensemble,
+ alpha=0.95, ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output='decision')
# Check if we can compute confidence intervals
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/plot_barchart.html b/WORC/doc/_build/html/_modules/WORC/plotting/plot_barchart.html
index 525984b0..6f57b26b 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/plot_barchart.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/plot_barchart.html
@@ -8,7 +8,7 @@
- WORC.plotting.plot_barchart — WORC 3.5.0 documentation
+ WORC.plotting.plot_barchart — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/plot_images.html b/WORC/doc/_build/html/_modules/WORC/plotting/plot_images.html
index 426ad077..20e8a132 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/plot_images.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/plot_images.html
@@ -8,7 +8,7 @@
- WORC.plotting.plot_images — WORC 3.5.0 documentation
+ WORC.plotting.plot_images — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -165,7 +165,7 @@
Source code for WORC.plotting.plot_images
#!/usr/bin/env python
-# Copyright 2016-2019 Biomedical Imaging Group Rotterdam, Departments of
+# Copyright 2016-2021 Biomedical Imaging Group Rotterdam, Departments of
# Medical Informatics and Radiology, Erasmus MC, Rotterdam, The Netherlands
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -189,6 +189,7 @@ Source code for WORC.plotting.plot_images
import matplotlib.colors as colors
import SimpleITK as sitk
from skimage import morphology
+import WORC.addexceptions as ae
[docs]def extract_boundary(contour, radius=2):
@@ -216,9 +217,10 @@ Source code for WORC.plotting.plot_images
[docs]def slicer(image, mask=None, output_name=None, output_name_zoom=None,
- thresholds=[-240, 160], zoomfactor=4, dpi=500, normalize=False,
+ thresholds=[-5, 5], zoomfactor=4, dpi=500, normalize=True,
expand=False, boundary=False, square=False, flip=True, rot90=0,
- alpha=0.40, axis='axial', index=None, color='cyan', radius=2):
+ alpha=0.40, axis='axial', index=None, color='cyan', radius=2,
+ colormap='gray'):
"""Plot slice of image where mask is largest, with mask as overlay.
image and mask should both be arrays
@@ -371,7 +373,7 @@ Source code for WORC.plotting.plot_images
# Plot the image and overlay the mask
fig = plot_im_and_overlay(imslice, maskslice, figsize=figsize, alpha=alpha,
- color=color)
+ color=color, colormap=colormap)
# Save Output
print('\t Saving output.')
@@ -400,8 +402,13 @@ Source code for WORC.plotting.plot_images
[docs]def plot_im_and_overlay(image, mask=None, figsize=(3, 3), alpha=0.40,
- color='cyan'):
+ color='cyan', colormap='gray', colorbar=False):
"""Plot an image in a matplotlib figure and overlay with a mask."""
+ # Define colormap
+ validmaps = ['gray', 'turbo', 'jet']
+ if colormap not in validmaps:
+ raise ae.WORCKeyError(f'Colormap {colormap} is not valid. Should be one of {validmaps}.')
+
# Create a normalized colormap for the image and mask
imin = np.min(image)
imax = np.max(image)
@@ -415,10 +422,14 @@ Source code for WORC.plotting.plot_images
# Plot and save the full image
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(1, 1, 1)
- ax.imshow(image, cmap=plt.cm.gray, norm=norm_im, interpolation="bilinear")
+ mappable = ax.imshow(image, cmap=colormap, norm=norm_im, interpolation="bilinear")
if mask is not None:
ax.imshow(mask, cmap=cmap, norm=normO, alpha=alpha, interpolation="bilinear")
+ # Add colorbar
+ if colorbar:
+ fig.colorbar(mappable)
+
# Alter aspect ratio according to figure size
aspect = figsize[0]/figsize[1]
ax.set_aspect(aspect)
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/plot_ranked_scores.html b/WORC/doc/_build/html/_modules/WORC/plotting/plot_ranked_scores.html
index 5975ad66..4961ea82 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/plot_ranked_scores.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/plot_ranked_scores.html
@@ -8,7 +8,7 @@
- WORC.plotting.plot_ranked_scores — WORC 3.5.0 documentation
+ WORC.plotting.plot_ranked_scores — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -212,9 +212,12 @@ Source code for WORC.plotting.plot_ranked_scores
parser.add_argument('-segmentations', '--segmentations', metavar='segmentations',
nargs='+', dest='segs', type=str, required=True,
help='Segmentations of patients (ITK Image files)')
- parser.add_argument('-ensemble', '--ensemble', metavar='ensemble',
- nargs='+', dest='ens', type=str, required=True,
- help='Either length of ensemble (int) or Caruana (string)')
+ parser.add_argument('-ensemble_method', '--ensemble_method', metavar='ensemble_method',
+ nargs='+', dest='ens_method', type=str, required=True,
+ help='Method to be used for ensembling (string)')
+ parser.add_argument('-ensemble_size', '--ensemble_size', metavar='ensemble_size',
+ nargs='+', dest='ens_size', type=str, required=True,
+ help='If ensembling method is top_N, size to be used (int)')
parser.add_argument('-label_type', '--label_type', metavar='label_type',
nargs='+', dest='label_type', type=str, required=True,
help='Label name that is predicted by estimator (string)')
@@ -238,9 +241,13 @@ Source code for WORC.plotting.plot_ranked_scores
if type(estimator) is list:
estimator = ''.join(estimator)
- ensemble = args.ens
- if type(ensemble) is list:
- ensemble = ''.join(ensemble)
+ ensemble_method = args.ens_method
+ if type(ensemble_method) is list:
+ ensemble_method = ''.join(ensemble_method)
+
+ ensemble_size = args.ens_size
+ if type(ensemble_size) is list:
+ ensemble_size = int(ensemble_size[0])
label_type = args.label_type
if type(label_type) is list:
@@ -264,7 +271,8 @@ Source code for WORC.plotting.plot_ranked_scores
scores=scores,
images=args.ims,
segmentations=args.segs,
- ensemble=ensemble,
+ ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output_csv=output_csv,
output_zip=output_zip)
@@ -282,7 +290,8 @@ Source code for WORC.plotting.plot_ranked_scores
[docs]def plot_ranked_percentages(estimator, pinfo, label_type=None,
- ensemble=50, output_csv=None):
+ ensemble_method='top_N', ensemble_size=100,
+ output_csv=None):
# Read the inputs
prediction = pd.read_hdf(estimator)
@@ -294,7 +303,8 @@ Source code for WORC.plotting.plot_ranked_scores
pinfo,
[label_type],
alpha=0.95,
- ensemble=ensemble,
+ ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output='stats')
percentages = stats['Rankings']['Percentages']
@@ -418,7 +428,8 @@ Source code for WORC.plotting.plot_ranked_scores
[docs]def plot_ranked_posteriors(estimator, pinfo, label_type=None,
- ensemble=50, output_csv=None):
+ ensemble_method='top_N', ensemble_size=100,
+ output_csv=None):
# Read the inputs
prediction = pd.read_hdf(estimator)
if label_type is None:
@@ -432,7 +443,8 @@ Source code for WORC.plotting.plot_ranked_scores
pinfo,
[label_type],
alpha=0.95,
- ensemble=ensemble,
+ ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output='scores')
# Extract all scores for each patient
@@ -546,7 +558,8 @@ Source code for WORC.plotting.plot_ranked_scores
[docs]def plot_ranked_scores(estimator, pinfo, label_type, scores='percentages',
- images=[], segmentations=[], ensemble=50,
+ images=[], segmentations=[], ensemble_method='top_N',
+ ensemble_size=100,
output_csv=None, output_zip=None, output_itk=None):
'''
Rank the patients according to their average score. The score can either
@@ -579,10 +592,11 @@ Source code for WORC.plotting.plot_ranked_scores
List containing the filepaths to the ITKImage segmentation files of
the patients.
- ensemble: integer or string, optional
- Method to be used for ensembling. Either an integer for a fixed size
- or 'Caruana' for the Caruana method, see the SearchCV function for more
- details.
+ ensemble_method: string, optional
+ Method to be used for ensembling.
+
+ ensemble_size: int, optional
+ If top_N method is used, number of workflows to be included in ensemble.
output_csv: filepath, optional
If given, the scores will be written to this csv file.
@@ -605,8 +619,10 @@ Source code for WORC.plotting.plot_ranked_scores
plot_ranked_posteriors(estimator=estimator,
pinfo=pinfo,
label_type=label_type,
- ensemble=ensemble,
+ ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output_csv=output_csv)
+
elif scores == 'percentages':
if prediction[prediction.keys()[0]].config['CrossValidation']['Type'] == 'LOO':
print('Cannot rank percentages for LOO, returning dummies.')
@@ -619,7 +635,8 @@ Source code for WORC.plotting.plot_ranked_scores
plot_ranked_percentages(estimator=estimator,
pinfo=pinfo,
label_type=label_type,
- ensemble=ensemble,
+ ensemble_method=ensemble_method,
+ ensemble_size=ensemble_size,
output_csv=output_csv)
else:
message = ('{} is not a valid scoring method!').format(str(scores))
@@ -671,113 +688,6 @@ Source code for WORC.plotting.plot_ranked_scores
'w', zipfile.ZIP_DEFLATED, allowZip64=True)
-[docs]def example():
- case = 'MESFIB'
- if case == 'CLM':
- label_type = None
- estimator = '/media/martijn/DATA/tmp/classification_0_nonewfeat.hdf5'
- ensemble = 50
- scores = 'percentages'
- pinfo = '/home/martijn/git/RadTools/CLM/pinfo_CLM_KM.txt'
- images_temp = glob.glob('/media/martijn/DATA/CLM/*/*/*/image.nii.gz')
- segmentations = list()
- images = list()
- for i in images_temp:
- segs = glob.glob(os.path.dirname(i) + '/seg_*session2*.nii.gz')
- if len(segs) == 1:
- segmentations.append(segs[0])
- images.append(i)
- elif len(segs) > 1:
- segmentations.append(segs[0])
- images.append(i)
- else:
- segs = glob.glob(os.path.dirname(i) + '/seg_*session1*.nii.gz')
- if len(segs) == 1:
- segmentations.append(segs[0])
- images.append(i)
- elif len(segs) > 1:
- segmentations.append(segs[0])
- images.append(i)
- else:
- print(i)
-
- output_csv = '/media/martijn/DATA/tmp/classification_0_nonewfeat_percentages.csv'
- output_zip = '/media/martijn/DATA/tmp/classification_0_nonewfeat_percentages.zip'
- elif case == 'MESFIB':
- label_type = None
- estimator = '/media/martijn/DATA/MESFIB/Results_0704/classification_100crossval_nonewfeat.hdf5'
- ensemble = 50
- scores = 'percentages'
- pinfo = '/home/martijn/git/RadTools/MESFIB/pinfo_MESFIB.txt'
- images_temp = glob.glob('/media/martijn/DATA/MESFIB/*/*/*/image.nii.gz')
- segmentations = list()
- images = list()
- for i in images_temp:
- segs = glob.glob(os.path.dirname(i) + '/seg*Mass*.nii.gz')
- if len(segs) == 1:
- segmentations.append(segs[0])
- images.append(i)
- elif len(segs) > 1:
- segmentations.append(segs[0])
- images.append(i)
- else:
- segs = glob.glob(os.path.dirname(i) + '/seg_*mass*.nii.gz')
- if len(segs) == 1:
- segmentations.append(segs[0])
- images.append(i)
- elif len(segs) > 1:
- segmentations.append(segs[0])
- images.append(i)
- else:
- print(i)
-
- output_csv = '/media/martijn/DATA/MESFIB/Results_0704/classification_100crossval_nonewfeat_percentages.csv'
- output_zip = '/media/martijn/DATA/MESFIB/Results_0704/classification_100crossval_nonewfeat_percentages.zip'
-
- prediction = pd.read_hdf(estimator)
- if label_type is None:
- # Assume we want to have the first key
- label_type = prediction.keys()[0]
-
- if scores == 'posteriors':
- ranked_scores, ranked_truths, ranked_PIDs =\
- plot_ranked_posteriors(estimator=estimator,
- pinfo=pinfo,
- label_type=label_type,
- ensemble=ensemble,
- output_csv=output_csv)
- elif scores == 'percentages':
- ranked_scores, ranked_truths, ranked_PIDs =\
- plot_ranked_percentages(estimator=estimator,
- pinfo=pinfo,
- label_type=label_type,
- ensemble=ensemble,
- output_csv=output_csv)
- else:
- message = ('{} is not a valid scoring method!').format(str(scores))
- raise WORCKeyError(message)
-
- if output_zip is not None:
- # Convert to lower to later on overcome matching errors
- ranked_PIDs = [i.lower() for i in ranked_PIDs]
-
- if images:
- plot_ranked_images(pinfo=pinfo,
- label_type=label_type,
- images=images,
- segmentations=segmentations,
- ranked_truths=ranked_truths,
- ranked_scores=ranked_scores,
- ranked_PIDs=ranked_PIDs,
- output_zip=output_zip,
- scores=scores)
- else:
- # Make dummy
- if output_zip is not None:
- zipfile.ZipFile(output_zip,
- 'w', zipfile.ZIP_DEFLATED, allowZip64=True)
-
-
if __name__ == '__main__':
main()
diff --git a/WORC/doc/_build/html/_modules/WORC/plotting/scatterplot.html b/WORC/doc/_build/html/_modules/WORC/plotting/scatterplot.html
index 1548d97c..a4e5e48c 100644
--- a/WORC/doc/_build/html/_modules/WORC/plotting/scatterplot.html
+++ b/WORC/doc/_build/html/_modules/WORC/plotting/scatterplot.html
@@ -8,7 +8,7 @@
- WORC.plotting.scatterplot — WORC 3.5.0 documentation
+ WORC.plotting.scatterplot — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/processing/ExtractNLargestBlobsn.html b/WORC/doc/_build/html/_modules/WORC/processing/ExtractNLargestBlobsn.html
index 8e42e888..85f664f4 100644
--- a/WORC/doc/_build/html/_modules/WORC/processing/ExtractNLargestBlobsn.html
+++ b/WORC/doc/_build/html/_modules/WORC/processing/ExtractNLargestBlobsn.html
@@ -8,7 +8,7 @@
- WORC.processing.ExtractNLargestBlobsn — WORC 3.5.0 documentation
+ WORC.processing.ExtractNLargestBlobsn — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/processing/classes.html b/WORC/doc/_build/html/_modules/WORC/processing/classes.html
index d0ab7331..ca61bd99 100644
--- a/WORC/doc/_build/html/_modules/WORC/processing/classes.html
+++ b/WORC/doc/_build/html/_modules/WORC/processing/classes.html
@@ -8,7 +8,7 @@
- WORC.processing.classes — WORC 3.5.0 documentation
+ WORC.processing.classes — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/processing/label_processing.html b/WORC/doc/_build/html/_modules/WORC/processing/label_processing.html
index b18156c8..f838fdbb 100644
--- a/WORC/doc/_build/html/_modules/WORC/processing/label_processing.html
+++ b/WORC/doc/_build/html/_modules/WORC/processing/label_processing.html
@@ -8,7 +8,7 @@
- WORC.processing.label_processing — WORC 3.5.0 documentation
+ WORC.processing.label_processing — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -288,7 +288,7 @@ Source code for WORC.processing.label_processing
label_status (numpy array): The status of the different labels
for each patient
"""
- data = pd.read_csv(input_file, sep=None, header=0)
+ data = pd.read_csv(input_file, sep=None, header=0, engine='python')
# Load and check the header
header = data.keys()
diff --git a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/CalcFeatures_test.html b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/CalcFeatures_test.html
index c622b99b..207207a2 100644
--- a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/CalcFeatures_test.html
+++ b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/CalcFeatures_test.html
@@ -8,7 +8,7 @@
- WORC.resources.fastr_tests.CalcFeatures_test — WORC 3.5.0 documentation
+ WORC.resources.fastr_tests.CalcFeatures_test — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/elastix_test.html b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/elastix_test.html
index b7cf49e8..a1bc5350 100644
--- a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/elastix_test.html
+++ b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/elastix_test.html
@@ -8,7 +8,7 @@
- WORC.resources.fastr_tests.elastix_test — WORC 3.5.0 documentation
+ WORC.resources.fastr_tests.elastix_test — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/segmentix_test.html b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/segmentix_test.html
index ab190e36..4b3701bf 100644
--- a/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/segmentix_test.html
+++ b/WORC/doc/_build/html/_modules/WORC/resources/fastr_tests/segmentix_test.html
@@ -8,7 +8,7 @@
- WORC.resources.fastr_tests.segmentix_test — WORC 3.5.0 documentation
+ WORC.resources.fastr_tests.segmentix_test — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/tools/Elastix.html b/WORC/doc/_build/html/_modules/WORC/tools/Elastix.html
index 8edc4f5e..f7ebc124 100644
--- a/WORC/doc/_build/html/_modules/WORC/tools/Elastix.html
+++ b/WORC/doc/_build/html/_modules/WORC/tools/Elastix.html
@@ -8,7 +8,7 @@
- WORC.tools.Elastix — WORC 3.5.0 documentation
+ WORC.tools.Elastix — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/tools/Evaluate.html b/WORC/doc/_build/html/_modules/WORC/tools/Evaluate.html
index f69b4016..bc8a8bc4 100644
--- a/WORC/doc/_build/html/_modules/WORC/tools/Evaluate.html
+++ b/WORC/doc/_build/html/_modules/WORC/tools/Evaluate.html
@@ -8,7 +8,7 @@
- WORC.tools.Evaluate — WORC 3.5.0 documentation
+ WORC.tools.Evaluate — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -190,8 +190,10 @@ Source code for WORC.tools.Evaluate
[docs]class Evaluate(object):
"""Build a network that evaluates the performance of an estimator."""
-[docs] def __init__(self, label_type, modus='binary_classification', ensemble=50,
+[docs] def __init__(self, label_type, modus='binary_classification',
scores='percentages',
+ ensemble_method='top_N',
+ ensemble_size=100,
parent=None, features=None,
fastr_plugin='LinearExecution',
name='Example'):
@@ -210,14 +212,14 @@ Source code for WORC.tools.Evaluate
self.network = parent.network
self.mode = 'WORC'
self.name = parent.network.id
- self.ensemble = parent.configs[0]['Ensemble']['Use']
else:
self.mode = 'StandAlone'
self.fastr_plugin = fastr_plugin
self.name = 'WORC_Evaluate_' + name
self.network = fastr.create_network(id=self.name)
self.fastr_tmpdir = os.path.join(fastr.config.mounts['tmp'], self.name)
- self.ensemble = ensemble
+ self.ensemble_method = ensemble_method
+ self.ensemble_size = ensemble_size
if features is None and self.mode == 'StandAlone':
raise WORCexceptions.WORCIOError('Either features as input or a WORC network is required for the Evaluate network.')
@@ -421,9 +423,14 @@ Source code for WORC.tools.Evaluate
self.network.create_constant('String', [self.label_type],
id='LabelType',
step_id='Evaluation')
- self.source_Ensemble =\
- self.network.create_constant('String', [self.ensemble],
- id='Ensemble',
+ self.source_ensemble_method =\
+ self.network.create_constant('String', [self.ensemble_method],
+ id='ensemble_method',
+ step_id='Evaluation')
+
+ self.source_ensemble_size =\
+ self.network.create_constant('String', [self.ensemble_size],
+ id='ensemble_size',
step_id='Evaluation')
# Create sources if not supplied by a WORC network
@@ -509,26 +516,32 @@ Source code for WORC.tools.Evaluate
self.link_segmentations_post.collapse = 'patients'
if self.modus == 'binary_classification':
- self.node_ROC.inputs['ensemble'] = self.source_Ensemble.output
+ self.node_ROC.inputs['ensemble_method'] = self.source_ensemble_method.output
+ self.node_ROC.inputs['ensemble_size'] = self.source_ensemble_size.output
self.node_ROC.inputs['label_type'] = self.source_LabelType.output
if 'classification' in self.modus:
- self.node_Ranked_Percentages.inputs['ensemble'] =\
- self.source_Ensemble.output
+ self.node_Ranked_Percentages.inputs['ensemble_method'] =\
+ self.source_ensemble_method.output
+ self.node_Ranked_Percentages.inputs['ensemble_size'] =\
+ self.source_ensemble_size.output
self.node_Ranked_Percentages.inputs['label_type'] =\
self.source_LabelType.output
- self.node_Estimator.inputs['ensemble'] = self.source_Ensemble.output
+ self.node_Estimator.inputs['ensemble_method'] = self.source_ensemble_method.output
+ self.node_Estimator.inputs['ensemble_size'] = self.source_ensemble_size.output
self.node_Estimator.inputs['label_type'] = self.source_LabelType.output
- self.node_Barchart.inputs['estimators'] = self.source_Ensemble.output
+ self.node_Barchart.inputs['estimators'] = self.source_ensemble_size.output
self.node_Barchart.inputs['label_type'] = self.source_LabelType.output
- self.node_Hyperparameters.inputs['estimators'] = self.source_Ensemble.output
+ self.node_Hyperparameters.inputs['estimators'] = self.source_ensemble_size.output
self.node_Hyperparameters.inputs['label_type'] = self.source_LabelType.output
- self.node_Ranked_Posteriors.inputs['ensemble'] =\
- self.source_Ensemble.output
+ self.node_Ranked_Posteriors.inputs['ensemble_method'] =\
+ self.source_ensemble_method.output
+ self.node_Ranked_Posteriors.inputs['ensemble_size'] =\
+ self.source_ensemble_size.output
self.node_Ranked_Posteriors.inputs['label_type'] =\
self.source_LabelType.output
@@ -541,7 +554,10 @@ Source code for WORC.tools.Evaluate
else:
pinfo = self.parent.source_patientclass_train.output
- config = self.parent.source_class_config.output
+ if self.parent.configs[0]['General']['Fingerprint'] == 'True':
+ config = self.parent.node_fingerprinters['classification'].outputs['config']
+ else:
+ config = self.parent.source_class_config.output
if hasattr(self.parent, 'sources_images_train'):
if self.parent.sources_images_train:
@@ -552,23 +568,28 @@ Source code for WORC.tools.Evaluate
self.parent.sources_segmentations_train[label].output
if self.modus == 'binary_classification':
- self.node_ROC.inputs['ensemble'] = self.parent.source_Ensemble.output
+ self.node_ROC.inputs['ensemble_method'] = self.parent.source_ensemble_method.output
+ self.node_ROC.inputs['ensemble_size'] = self.parent.source_ensemble_size.output
self.node_ROC.inputs['label_type'] = self.parent.source_LabelType.output
if 'classification' in self.modus:
- self.node_Ranked_Percentages.inputs['ensemble'] =\
- self.parent.source_Ensemble.output
+ self.node_Ranked_Percentages.inputs['ensemble_method'] =\
+ self.parent.source_ensemble_method.output
+ self.node_Ranked_Percentages.inputs['ensemble_size'] =\
+ self.parent.source_ensemble_size.output
self.node_Ranked_Percentages.inputs['label_type'] =\
self.parent.source_LabelType.output
- self.node_Barchart.inputs['estimators'] = self.parent.source_Ensemble.output
+ self.node_Barchart.inputs['estimators'] = self.parent.source_ensemble_size.output
self.node_Barchart.inputs['label_type'] = self.parent.source_LabelType.output
- self.node_Hyperparameters.inputs['estimators'] = self.parent.source_Ensemble.output
+ self.node_Hyperparameters.inputs['estimators'] = self.parent.source_ensemble_size.output
self.node_Hyperparameters.inputs['label_type'] = self.parent.source_LabelType.output
- self.node_Ranked_Posteriors.inputs['ensemble'] =\
- self.parent.source_Ensemble.output
+ self.node_Ranked_Posteriors.inputs['ensemble_method'] =\
+ self.parent.source_ensemble_method.output
+ self.node_Ranked_Posteriors.inputs['ensemble_size'] =\
+ self.parent.source_ensemble_size.output
self.node_Ranked_Posteriors.inputs['label_type'] =\
self.parent.source_LabelType.output
@@ -739,7 +760,8 @@ Source code for WORC.tools.Evaluate
self.source_data['Segmentations'] = segmentations
self.source_data['Config'] = config
self.source_data['LabelType'] = self.label_type
- self.source_data['Ensemble'] = self.ensemble
+ self.source_data['ensemble_method'] = self.ensemble_method
+ self.source_data['ensemble_size'] = self.ensemble_size
for feature, label in zip(features, self.labels):
self.source_data[label] = feature
diff --git a/WORC/doc/_build/html/_modules/WORC/tools/Slicer.html b/WORC/doc/_build/html/_modules/WORC/tools/Slicer.html
index 6ba8c334..d64d783e 100644
--- a/WORC/doc/_build/html/_modules/WORC/tools/Slicer.html
+++ b/WORC/doc/_build/html/_modules/WORC/tools/Slicer.html
@@ -8,7 +8,7 @@
- WORC.tools.Slicer — WORC 3.5.0 documentation
+ WORC.tools.Slicer — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/tools/Transformix.html b/WORC/doc/_build/html/_modules/WORC/tools/Transformix.html
index de97124c..a558fb96 100644
--- a/WORC/doc/_build/html/_modules/WORC/tools/Transformix.html
+++ b/WORC/doc/_build/html/_modules/WORC/tools/Transformix.html
@@ -8,7 +8,7 @@
- WORC.tools.Transformix — WORC 3.5.0 documentation
+ WORC.tools.Transformix — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/WORC/tools/createfixedsplits.html b/WORC/doc/_build/html/_modules/WORC/tools/createfixedsplits.html
index ad86b30a..3717fd6f 100644
--- a/WORC/doc/_build/html/_modules/WORC/tools/createfixedsplits.html
+++ b/WORC/doc/_build/html/_modules/WORC/tools/createfixedsplits.html
@@ -8,7 +8,7 @@
- WORC.tools.createfixedsplits — WORC 3.5.0 documentation
+ WORC.tools.createfixedsplits — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/_modules/index.html b/WORC/doc/_build/html/_modules/index.html
index b928ae95..f97bcc63 100644
--- a/WORC/doc/_build/html/_modules/index.html
+++ b/WORC/doc/_build/html/_modules/index.html
@@ -8,7 +8,7 @@
- Overview: module code — WORC 3.5.0 documentation
+ Overview: module code — WORC 3.6.0 documentation
@@ -62,7 +62,7 @@
- 3.5.0
+ 3.6.0
@@ -181,6 +181,7 @@ All modules for which code is available
WORC.classification.fitandscore
WORC.classification.metrics
WORC.classification.parameter_optimization
+WORC.classification.smac
WORC.classification.trainclassifier
WORC.detectors.detectors
WORC.exampledata.create_example_data
@@ -244,6 +245,7 @@ All modules for which code is available
WORC.tools.Slicer
WORC.tools.Transformix
WORC.tools.createfixedsplits
+WORC.tools.fingerprinting
WORC.validators.preflightcheck
diff --git a/WORC/doc/_build/html/_sources/autogen/WORC.classification.rst.txt b/WORC/doc/_build/html/_sources/autogen/WORC.classification.rst.txt
index 1f358e9b..49bc90f1 100644
--- a/WORC/doc/_build/html/_sources/autogen/WORC.classification.rst.txt
+++ b/WORC/doc/_build/html/_sources/autogen/WORC.classification.rst.txt
@@ -100,6 +100,15 @@ classification Package
:show-inheritance:
:special-members:
+:mod:`smac` Module
+------------------
+
+.. automodule:: WORC.classification.smac
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members:
+
:mod:`trainclassifier` Module
-----------------------------
diff --git a/WORC/doc/_build/html/_sources/autogen/WORC.config.rst.txt b/WORC/doc/_build/html/_sources/autogen/WORC.config.rst.txt
index c6c0f742..e6fb47dd 100644
--- a/WORC/doc/_build/html/_sources/autogen/WORC.config.rst.txt
+++ b/WORC/doc/_build/html/_sources/autogen/WORC.config.rst.txt
@@ -10,6 +10,7 @@ Evaluation :ref:`Evaluation `
FeatPreProcess :ref:`FeatPreProcess `
Featsel :ref:`Featsel `
FeatureScaling :ref:`FeatureScaling `
+Fingerprinting :ref:`Fingerprinting `
General :ref:`General `
HyperOptimization :ref:`HyperOptimization `
ImageFeatures :ref:`ImageFeatures `
@@ -19,6 +20,7 @@ OneHotEncoding :ref:`OneHotEncoding `
Preprocessing :ref:`Preprocessing `
PyRadiomics :ref:`PyRadiomics `
Resampling :ref:`Resampling `
+SMAC :ref:`SMAC `
Segmentix :ref:`Segmentix `
SelectFeatGroup :ref:`SelectFeatGroup `
================= ===================================================
\ No newline at end of file
diff --git a/WORC/doc/_build/html/_sources/autogen/WORC.rst.txt b/WORC/doc/_build/html/_sources/autogen/WORC.rst.txt
index 6872a9c8..02387248 100644
--- a/WORC/doc/_build/html/_sources/autogen/WORC.rst.txt
+++ b/WORC/doc/_build/html/_sources/autogen/WORC.rst.txt
@@ -47,5 +47,6 @@ Subpackages
WORC.statistics
WORC.tests
WORC.tools
+ WORC.tutorial
WORC.validators
diff --git a/WORC/doc/_build/html/_sources/autogen/WORC.tools.rst.txt b/WORC/doc/_build/html/_sources/autogen/WORC.tools.rst.txt
index f9059a22..786d8f28 100644
--- a/WORC/doc/_build/html/_sources/autogen/WORC.tools.rst.txt
+++ b/WORC/doc/_build/html/_sources/autogen/WORC.tools.rst.txt
@@ -55,3 +55,12 @@ tools Package
:show-inheritance:
:special-members:
+:mod:`fingerprinting` Module
+----------------------------
+
+.. automodule:: WORC.tools.fingerprinting
+ :members:
+ :undoc-members:
+ :show-inheritance:
+ :special-members:
+
diff --git a/WORC/doc/_build/html/_sources/static/configuration.rst.txt b/WORC/doc/_build/html/_sources/static/configuration.rst.txt
index 261b5fc6..0ad52d55 100644
--- a/WORC/doc/_build/html/_sources/static/configuration.rst.txt
+++ b/WORC/doc/_build/html/_sources/static/configuration.rst.txt
@@ -403,6 +403,23 @@ Specify the hyperparameter optimization procedure here.
.. include:: ../autogen/config/WORC.config_HyperOptimization_defopts.rst
+.. _config-SMAC:
+
+SMAC
+~~~~
+WORC enables the use of the SMAC algorithm for the hyperparameter optimization.
+SMAC uses the same parameter options as the default random search, except for
+resampling which is currently not compatible with SMAC.
+
+**Description:**
+
+.. include:: ../autogen/config/WORC.config_SMAC_description.rst
+
+**Defaults and Options:**
+
+.. include:: ../autogen/config/WORC.config_SMAC_defopts.rst
+
+
.. _config-Ensemble:
Ensemble
diff --git a/WORC/doc/_build/html/_sources/static/quick_start.rst.txt b/WORC/doc/_build/html/_sources/static/quick_start.rst.txt
index a1cca080..1289e11b 100644
--- a/WORC/doc/_build/html/_sources/static/quick_start.rst.txt
+++ b/WORC/doc/_build/html/_sources/static/quick_start.rst.txt
@@ -183,6 +183,11 @@ After defining the inputs, the following code can be used to run your first expe
experiment.labels_from_this_file(label_file)
experiment.predict_labels(label_name)
+ # Set the types of images WORC has to process. Used in fingerprinting
+ # Valid quantitative types are ['CT', 'PET', 'Thermography', 'ADC']
+ # Valid qualitative types are ['MRI', 'DWI', 'US']
+ experiment.set_image_types(['CT'])
+
# Use the standard workflow for your specific modus
if modus == 'binary_classification':
experiment.binary_classification(coarse=coarse)
diff --git a/WORC/doc/_build/html/_static/documentation_options.js b/WORC/doc/_build/html/_static/documentation_options.js
index 1d0981c2..9ddc46e0 100644
--- a/WORC/doc/_build/html/_static/documentation_options.js
+++ b/WORC/doc/_build/html/_static/documentation_options.js
@@ -1,6 +1,6 @@
var DOCUMENTATION_OPTIONS = {
URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'),
- VERSION: '3.5.0',
+ VERSION: '3.6.0',
LANGUAGE: 'None',
COLLAPSE_INDEX: false,
FILE_SUFFIX: '.html',
diff --git a/WORC/doc/_build/html/autogen/WORC.IOparser.html b/WORC/doc/_build/html/autogen/WORC.IOparser.html
index 120cb57b..c5c4a3c8 100644
--- a/WORC/doc/_build/html/autogen/WORC.IOparser.html
+++ b/WORC/doc/_build/html/autogen/WORC.IOparser.html
@@ -8,7 +8,7 @@
- IOparser Package — WORC 3.5.0 documentation
+ IOparser Package — WORC 3.6.0 documentation
@@ -64,7 +64,7 @@
- 3.5.0
+ 3.6.0
diff --git a/WORC/doc/_build/html/autogen/WORC.classification.html b/WORC/doc/_build/html/autogen/WORC.classification.html
index 92008c45..68d92d4e 100644
--- a/WORC/doc/_build/html/autogen/WORC.classification.html
+++ b/WORC/doc/_build/html/autogen/WORC.classification.html
@@ -8,7 +8,7 @@
- classification Package — WORC 3.5.0 documentation
+ classification Package — WORC 3.6.0 documentation
@@ -64,7 +64,7 @@
- 3.5.0
+ 3.6.0
@@ -119,6 +119,7 @@
metrics
Module
parameter_optimization
Module
regressors
Module
+smac
Module
trainclassifier
Module
@@ -540,7 +541,7 @@
SearchCV
ModuleΒΆ
-
-class
WORC.classification.SearchCV.
BaseSearchCV
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False)[source]ΒΆ
+class WORC.classification.SearchCV.
BaseSearchCV
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False, ensemble_validation_score=None)[source]ΒΆ
Bases: sklearn.base.BaseEstimator
, sklearn.base.MetaEstimatorMixin
Base class for hyper parameter search with cross-validation.
@@ -550,7 +551,7 @@
-
-
__init__
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False)[source]ΒΆ
+__init__
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False, ensemble_validation_score=None)[source]ΒΆ
Initialize SearchCV Object.
@@ -561,30 +562,34 @@
-
-
create_ensemble
(X_train, Y_train, verbose=None, initialize=True, scoring=None, method=50, overfit_scaler=False)[source]ΒΆ
-Create ensemble of multiple workflows.
-Create an (optimal) ensemble of a combination of hyperparameter settings
+create_ensemble
(X_train, Y_train, verbose=None, initialize=False, scoring=None, method='top_N', size=50, overfit_scaler=False)[source]ΒΆ
+
Create an (optimal) ensemble of a combination of hyperparameter settings
and the associated groupsels, PCAs, estimators etc.
-Based on Caruana et al. 2004, but a little different:
-
-Recreate the training/validation splits for a n-fold cross validation.
-
-- For each fold:
-Start with an empty ensemble
-Create starting ensemble by adding N individually best performing
-models on the validation set. N is tuned on the validation set.
-Add model that improves ensemble performance on validation set the most, with replacement.
-Repeat (c) untill performance does not increase
-
+
+- # The following ensemble methods are supported:
# Single:
+# only use the single best classifier. Performance is computed
+# using the same predict function as during the optimization
+# top_N:
+# make an ensemble of the best N individual classifiers, where N is
+# given as an input. If N==1, then only the single best classifier is
+# used, but it is evaluated using predict_proba.
+# FitNumber:
+# make an ensemble of the best N individual classifiers, choosing N
+# that gives the highest performance
+# ForwardSelection:
+# add the model that optimizes the total ensemble performance,
+# then repeat with replacement until there is no more improvement
+# in performance
+# Caruana:
+# for a fixed number of iterations, add the model that optimizes
+# the total ensemble performance, then choose the ensemble size
+# which gave the best performance
+# Bagging:
+# same as Caruana method, but the final ensemble is a weighted average
+# of a number of ensembles that each use only a subset of the available
+# models
-
-
-The performance metric is the same as for the original hyperparameter
-search, i.e. probably the F1-score for classification and r2-score
-for regression. However, we recommend using the SAR score, as this is
-more universal.
-Method: top50 or Caruana
@@ -660,7 +665,7 @@
-
-
process_fit
(n_splits, parameters_all, test_sample_counts, test_score_dicts, train_score_dicts, fit_time, score_time, cv_iter, X, y, fitted_workflows=None)[source]ΒΆ
+process_fit
(n_splits, parameters_all, test_sample_counts, test_score_dicts, train_score_dicts, fit_time, score_time, cv_iter, X, y, fitted_workflows=None, use_smac=False)[source]ΒΆ
Process a fit.
Process the outcomes of a SearchCV fit and find the best settings
over all cross validations from all hyperparameters tested
@@ -722,7 +727,7 @@
-
-class
WORC.classification.SearchCV.
BaseSearchCVJoblib
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False)[source]ΒΆ
+class WORC.classification.SearchCV.
BaseSearchCVJoblib
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False, ensemble_validation_score=None)[source]ΒΆ
Bases: WORC.classification.SearchCV.BaseSearchCV
Base class for hyper parameter search with cross-validation.
@@ -737,9 +742,26 @@
+
+-
+class
WORC.classification.SearchCV.
BaseSearchCVSMAC
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False, ensemble_validation_score=None)[source]ΒΆ
+Bases: WORC.classification.SearchCV.BaseSearchCV
+Base class for Bayesian hyper parameter search with cross-validation.
+
+-
+
__abstractmethods__
= frozenset({'__init__'})ΒΆ
+
+
+
+-
+
__module__
= 'WORC.classification.SearchCV'ΒΆ
+
+
+
+
-
-class
WORC.classification.SearchCV.
BaseSearchCVfastr
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False)[source]ΒΆ
+class WORC.classification.SearchCV.
BaseSearchCVfastr
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, fastr_plugin=None, memory='2G', ranking_score='test_score', refit_workflows=False, ensemble_validation_score=None)[source]ΒΆ
Bases: WORC.classification.SearchCV.BaseSearchCV
Base class for hyper parameter search with cross-validation.
@@ -1393,6 +1415,236 @@
+
+-
+class
WORC.classification.SearchCV.
GuidedSearchCVSMAC
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, fastr_plugin=None, maxlen=100, ranking_score='test_score', features=None, labels=None, smac_result_file=None)[source]ΒΆ
+Bases: WORC.classification.SearchCV.BaseSearchCVSMAC
+Guided search on hyperparameters.
+GuidedSearchCV implements a “fit” and a “score” method.
+It also implements “predict”, “predict_proba”, “decision_function”,
+“transform” and “inverse_transform” if they are implemented in the
+estimator used.
+The parameters of the estimator used to apply these methods are optimized
+by cross-validated search over parameter settings.
+The optimization is performed using the Sequential Model-based Algorithm
+Configuration (SMAC) method. A probabilistic model of the objective function
+is constructed and updated with each function evaluation.
+If all parameters are presented as a list,
+sampling without replacement is performed. If at least one parameter
+is given as a distribution, sampling with replacement is used.
+It is highly recommended to use continuous distributions for continuous
+parameters.
+
+- param_distributionsdict
Dictionary with parameter names (string) as keys and details of their
+domains as values. From this dictionary the complete search space
+will later be constructed.
+
+- n_iterint, default=10
Number of function evaluations allowed in each optimization sequence
+of SMAC.
+
+- scoringstring, callable or None, default=None
A string (see model evaluation documentation) or
+a scorer callable object / function with signature
+scorer(estimator, X, y)
.
+If None
, the score
method of the estimator is used.
+
+- fit_paramsdict, optional
Parameters to pass to the fit method.
+
+- n_jobsint, default=1
Number of jobs to run in parallel.
+
+- pre_dispatchint, or string, optional
Controls the number of jobs that get dispatched during parallel
+execution. Reducing this number can be useful to avoid an
+explosion of memory consumption when more jobs get dispatched
+than CPUs can process. This parameter can be:
+
+
+None, in which case all the jobs are immediately
+created and spawned. Use this for lightweight and
+fast-running jobs, to avoid delays due to on-demand
+spawning of the jobs
+An int, giving the exact number of total jobs that are
+spawned
+A string, giving an expression as a function of n_jobs,
+as in “2*n_jobs”
+
+
+
+- iidboolean, default=True
If True, the data is assumed to be identically distributed across
+the folds, and the loss minimized is the total loss per sample,
+and not the mean loss across the folds.
+
+- cvint, cross-validation generator or an iterable, optional
Determines the cross-validation splitting strategy.
+Possible inputs for cv are:
+
+
+None, to use the default 3-fold cross validation,
+integer, to specify the number of folds in a (Stratified)KFold,
+An object to be used as a cross-validation generator.
+An iterable yielding train, test splits.
+
+
+For integer/None inputs, if the estimator is a classifier and y
is
+either binary or multiclass, StratifiedKFold
is used. In all
+other cases, KFold
is used.
+Refer to the User Guide for the various
+cross-validation strategies that can be used here.
+
+- refitboolean, default=True
Refit the best estimator with the entire dataset.
+If “False”, it is impossible to make predictions using
+this RandomizedSearchCV instance after fitting.
+
+- verboseinteger
Controls the verbosity: the higher, the more messages.
+
+- random_stateint or RandomState
Pseudo random number generator state used for random uniform sampling
+from lists of possible values instead of scipy.stats distributions.
+
+- error_score“raise” (default) or numeric
Value to assign to the score if an error occurs in estimator fitting.
+If set to “raise”, the error is raised. If a numeric value is given,
+FitFailedWarning is raised. This parameter does not affect the refit
+step, which will always raise the error.
+
+- return_train_scoreboolean, default=True
If 'False'
, the cv_results_
attribute will not include training
+scores.
+
+
+
+- cv_results_dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
+imported into a pandas DataFrame
.
+For instance the below given table
+
+
+
+
+
+
+
+
+
+param_kernel
+param_gamma
+split0_test_score
+…
+rank_test_score
+
+
+
+“rbf”
+0.1
+0.8
+…
+2
+
+“rbf”
+0.2
+0.9
+…
+1
+
+“rbf”
+0.3
+0.7
+…
+1
+
+
+
+will be represented by a cv_results_
dict of:
+{
+'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
+ mask = False),
+'param_gamma' : masked_array(data = [0.1 0.2 0.3], mask = False),
+'split0_test_score' : [0.8, 0.9, 0.7],
+'split1_test_score' : [0.82, 0.5, 0.7],
+'mean_test_score' : [0.81, 0.7, 0.7],
+'std_test_score' : [0.02, 0.2, 0.],
+'rank_test_score' : [3, 1, 1],
+'split0_train_score' : [0.8, 0.9, 0.7],
+'split1_train_score' : [0.82, 0.5, 0.7],
+'mean_train_score' : [0.81, 0.7, 0.7],
+'std_train_score' : [0.03, 0.03, 0.04],
+'mean_fit_time' : [0.73, 0.63, 0.43, 0.49],
+'std_fit_time' : [0.01, 0.02, 0.01, 0.01],
+'mean_score_time' : [0.007, 0.06, 0.04, 0.04],
+'std_score_time' : [0.001, 0.002, 0.003, 0.005],
+'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
+}
+
+
+NOTE that the key 'params'
is used to store a list of parameter
+settings dict for all the parameter candidates.
+The mean_fit_time
, std_fit_time
, mean_score_time
and
+std_score_time
are all in seconds.
+
+- best_estimator_estimator
Estimator that was chosen by the search, i.e. estimator
+which gave highest score (or smallest loss if specified)
+on the left out data. Not available if refit=False.
+
+- best_score_float
Score of best_estimator on the left out data.
+
+- best_params_dict
Parameter setting that gave the best results on the hold out data.
+
+- best_index_int
The index (of the cv_results_
arrays) which corresponds to the best
+candidate parameter setting.
+The dict at search.cv_results_['params'][search.best_index_]
gives
+the parameter setting for the best model, that gives the highest
+mean score (search.best_score_
).
+
+- scorer_function
Scorer function used on the held out data to choose the best
+parameters for the model.
+
+- n_splits_int
The number of cross-validation splits (folds/iterations).
+
+
+The parameters selected are those that maximize the score of the held-out
+data, according to the scoring parameter.
+If n_jobs was set to a value higher than one, the data is copied for each
+parameter setting (and not n_jobs times). This is done for efficiency
+reasons if individual jobs take very little time, but may raise errors if
+the dataset is large and not enough memory is available. A workaround in
+this case is to set pre_dispatch. Then, the memory is copied only
+pre_dispatch many times. A reasonable value for pre_dispatch is 2 *
+n_jobs.
+
+GridSearchCV
:Does exhaustive search over a grid of parameters.
+
+ParameterSampler
:A generator over parameter settings, constructed from
+param_distributions.
+
+
+
+-
+
__abstractmethods__
= frozenset({})ΒΆ
+
+
+
+-
+
__init__
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, fastr_plugin=None, maxlen=100, ranking_score='test_score', features=None, labels=None, smac_result_file=None)[source]ΒΆ
+Initialize SearchCV Object.
+
+
+
+-
+
__module__
= 'WORC.classification.SearchCV'ΒΆ
+
+
+
+-
+
fit
(X, y=None, groups=None)[source]ΒΆ
+Run fit on the estimator with randomly drawn parameters.
+
+- Xarray-like, shape = [n_samples, n_features]
Training vector, where n_samples in the number of samples and
+n_features is the number of features.
+
+- yarray-like, shape = [n_samples] or [n_samples, n_output], optional
Target relative to X for classification or regression;
+None for unsupervised learning.
+
+- groupsarray-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
+train/test set.
+
+
+
+
+
+
-
class
WORC.classification.SearchCV.
RandomizedSearchCVJoblib
(param_distributions={}, n_iter=10, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', random_state=None, error_score='raise', return_train_score=True, n_jobspercore=100, maxlen=100, ranking_score='test_score')[source]ΒΆ
@@ -1493,7 +1745,7 @@
-- cv_results_dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
+
- cv_results_dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas DataFrame
.
For instance the below given table
@@ -1560,24 +1812,24 @@
The mean_fit_time
, std_fit_time
, mean_score_time
and
std_score_time
are all in seconds.
-- best_estimator_estimator
Estimator that was chosen by the search, i.e. estimator
+
- best_estimator_estimator
Estimator that was chosen by the search, i.e. estimator
which gave highest score (or smallest loss if specified)
on the left out data. Not available if refit=False.
-- best_score_float
Score of best_estimator on the left out data.
+- best_score_float
Score of best_estimator on the left out data.
-- best_params_dict
Parameter setting that gave the best results on the hold out data.
+- best_params_dict
Parameter setting that gave the best results on the hold out data.
-- best_index_int
The index (of the cv_results_
arrays) which corresponds to the best
+
- best_index_int
The index (of the cv_results_
arrays) which corresponds to the best
candidate parameter setting.
The dict at search.cv_results_['params'][search.best_index_]
gives
the parameter setting for the best model, that gives the highest
mean score (search.best_score_
).
-- scorer_function
Scorer function used on the held out data to choose the best
+
- scorer_function
Scorer function used on the held out data to choose the best
parameters for the model.
-- n_splits_int
The number of cross-validation splits (folds/iterations).
+- n_splits_int
The number of cross-validation splits (folds/iterations).
The parameters selected are those that maximize the score of the held-out
@@ -1731,7 +1983,7 @@
-- cv_results_dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
+
- cv_results_dict of numpy (masked) ndarrays
A dict with keys as column headers and values as columns, that can be
imported into a pandas DataFrame
.
For instance the below given table
@@ -1798,24 +2050,24 @@
The mean_fit_time
, std_fit_time
, mean_score_time
and
std_score_time
are all in seconds.
-- best_estimator_estimator
Estimator that was chosen by the search, i.e. estimator
+
- best_estimator_estimator
Estimator that was chosen by the search, i.e. estimator
which gave highest score (or smallest loss if specified)
on the left out data. Not available if refit=False.
-- best_score_float
Score of best_estimator on the left out data.
+- best_score_float
Score of best_estimator on the left out data.
-- best_params_dict
Parameter setting that gave the best results on the hold out data.
+- best_params_dict
Parameter setting that gave the best results on the hold out data.
-- best_index_int
The index (of the cv_results_
arrays) which corresponds to the best
+
- best_index_int
The index (of the cv_results_
arrays) which corresponds to the best
candidate parameter setting.
The dict at search.cv_results_['params'][search.best_index_]
gives
the parameter setting for the best model, that gives the highest
mean score (search.best_score_
).
-- scorer_function
Scorer function used on the held out data to choose the best
+
- scorer_function
Scorer function used on the held out data to choose the best
parameters for the model.
-- n_splits_int
The number of cross-validation splits (folds/iterations).
+- n_splits_int
The number of cross-validation splits (folds/iterations).
The parameters selected are those that maximize the score of the held-out
@@ -1950,14 +2202,14 @@
crossval
ModuleΒΆ
-
-
WORC.classification.crossval.
LOO_cross_validation
(image_features, feature_labels, classes, patient_ids, param_grid, config, modus, test_size, start=0, save_data=None, tempsave=False, tempfolder=None, fixedsplits=None, fixed_seed=False, use_fastr=None, fastr_plugin=None)[source]ΒΆ
+WORC.classification.crossval.
LOO_cross_validation
(image_features, feature_labels, classes, patient_ids, param_grid, config, modus, test_size, start=0, save_data=None, tempsave=False, tempfolder=None, fixedsplits=None, fixed_seed=False, use_fastr=None, fastr_plugin=None, use_SMAC=False, smac_result_file=None)[source]ΒΆ
Cross-validation in which each sample is once used as the test set.
Mostly based on the default sklearn object.
-
-
WORC.classification.crossval.
crossval
(config, label_data, image_features, param_grid=None, use_fastr=False, fastr_plugin=None, tempsave=False, fixedsplits=None, ensemble={'Use': False}, outputfolder=None, modus='singlelabel')[source]ΒΆ
+WORC.classification.crossval.
crossval
(config, label_data, image_features, param_grid=None, use_fastr=False, fastr_plugin=None, tempsave=False, fixedsplits=None, ensemble={'Use': False}, outputfolder=None, modus='singlelabel', use_SMAC=False, smac_result_file=None)[source]ΒΆ
Constructs multiple individual classifiers based on the label settings.
- config: dict, mandatory
Dictionary with config settings. See the Github Wiki for the
@@ -2053,7 +2305,7 @@