From 84af094113745e69b37b8f2a902f4dc99ee555d1 Mon Sep 17 00:00:00 2001
From: Florence Townend
Date: Wed, 15 May 2024 13:52:54 +0200
Subject: [PATCH] Added documentation for GPU training to the customising
 training section. Currently not implemented for subspace methods, but that
 is on the list.

---
 docs/customising_training.rst                 |  33 +++
 fusilli/data.py                               | 237 ++++++++++--------
 .../fusionmodels/tabularfusion/mcvae_model.py |  17 +-
 3 files changed, 176 insertions(+), 111 deletions(-)

diff --git a/docs/customising_training.rst b/docs/customising_training.rst
index 5a455c4..f24cfdd 100644
--- a/docs/customising_training.rst
+++ b/docs/customising_training.rst
@@ -5,6 +5,7 @@ This page will show you how to customise the training and evaluation of your fus
 
 We will cover the following topics:
 
+* Using a GPU
 * Early stopping
 * Validation metrics
 * Batch size
@@ -13,6 +14,38 @@ We will cover the following topics:
 * Number of workers in PyTorch DataLoader
 * Train/test and cross-validation splitting yourself
 
+Using a GPU
+-----------
+
+If you want to use a GPU to train your model, pass the ``training_modifications`` argument to the :func:`~.fusilli.data.prepare_fusion_data` and :func:`~.fusilli.train.train_and_save_models` functions. By default, the model will train on the CPU.
+
+For example, to train on a single GPU:
+
+.. code-block:: python
+
+    from fusilli.data import prepare_fusion_data
+    from fusilli.train import train_and_save_models
+
+    datamodule = prepare_fusion_data(
+        prediction_task="binary",
+        fusion_model=example_model,
+        data_paths=data_paths,
+        output_paths=output_path,
+    )
+
+    trained_model_list = train_and_save_models(
+        data_module=datamodule,
+        fusion_model=example_model,
+        training_modifications={"accelerator": "gpu", "devices": 1},
+    )
+
+.. warning::
+
+    GPU training is not yet implemented for subspace-based models (as of May 2024).
+    The documentation will be updated when it is supported.
+
+
+
 Early stopping
 --------------
 
diff --git a/fusilli/data.py b/fusilli/data.py
index b6c54ef..5a39f56 100644
--- a/fusilli/data.py
+++ b/fusilli/data.py
@@ -228,7 +228,9 @@ def __init__(self, sources, img_downsample_dims=None):
         if "ID" not in tab1_df.columns:
             raise ValueError("The CSV must have an index column named 'ID'.")
         if "prediction_label" not in tab1_df.columns:
-            raise ValueError("The CSV must have a label column named 'prediction_label'.")
+            raise ValueError(
+                "The CSV must have a label column named 'prediction_label'."
+            )
 
         # if tabular2_source exists, check it has the right columns
         if self.tabular2_source != "":
@@ -236,7 +238,9 @@ def __init__(self, sources, img_downsample_dims=None):
             if "ID" not in tab2_df.columns:
                 raise ValueError("The CSV must have an index column named 'ID'.")
             if "prediction_label" not in tab2_df.columns:
-                raise ValueError("The CSV must have a label column named 'prediction_label'.")
+                raise ValueError(
+                    "The CSV must have a label column named 'prediction_label'."
+ ) def load_tabular1(self): """ @@ -337,11 +341,17 @@ def load_tabular_tabular(self): tab1_df.set_index("ID", inplace=True) tab2_df.set_index("ID", inplace=True) - tab1_pred_features = torch.Tensor(tab1_df.drop(columns=["prediction_label"]).values) - tab2_pred_features = torch.Tensor(tab2_df.drop(columns=["prediction_label"]).values) + tab1_pred_features = torch.Tensor( + tab1_df.drop(columns=["prediction_label"]).values + ) + tab2_pred_features = torch.Tensor( + tab2_df.drop(columns=["prediction_label"]).values + ) prediction_label = tab1_df[["prediction_label"]] - dataset = CustomDataset([tab1_pred_features, tab2_pred_features], prediction_label) + dataset = CustomDataset( + [tab1_pred_features, tab2_pred_features], prediction_label + ) mod1_dim = tab1_pred_features.shape[1] mod2_dim = tab2_pred_features.shape[1] @@ -430,23 +440,23 @@ class TrainTestDataModule(pl.LightningDataModule): """ def __init__( - self, - fusion_model, - sources, - output_paths, - prediction_task, - batch_size, - test_size, - multiclass_dimensions, - subspace_method=None, - image_downsample_size=None, - layer_mods=None, - max_epochs=1000, - extra_log_string_dict=None, - own_early_stopping_callback=None, - num_workers=0, - test_indices=None, - kwargs=None, + self, + fusion_model, + sources, + output_paths, + prediction_task, + batch_size, + test_size, + multiclass_dimensions, + subspace_method=None, + image_downsample_size=None, + layer_mods=None, + max_epochs=1000, + extra_log_string_dict=None, + own_early_stopping_callback=None, + num_workers=0, + test_indices=None, + kwargs=None, ): """ Parameters @@ -539,8 +549,8 @@ def prepare_data(self): self.dataset, self.data_dims = self.modality_methods[self.modality_type]() def setup( - self, - checkpoint_path=None, + self, + checkpoint_path=None, ): """ Splits the data into train and test sets, and runs the subspace method if specified. 
@@ -565,30 +575,31 @@ def setup(
                 self.dataset, [1 - self.test_size, self.test_size]
             )
         else:
-            self.test_dataset = torch.utils.data.Subset(
-                self.dataset, self.test_indices
-            )
+            self.test_dataset = torch.utils.data.Subset(self.dataset, self.test_indices)
             self.train_dataset = torch.utils.data.Subset(
-                self.dataset, list(set(range(len(self.dataset))) - set(self.test_indices))
+                self.dataset,
+                list(set(range(len(self.dataset))) - set(self.test_indices)),
             )
 
         if self.subspace_method is not None:  # if subspace method is specified
             if (
-                    checkpoint_path is None
+                checkpoint_path is None
             ):  # if no checkpoint path specified, train the subspace method
                 self.subspace_method_train = self.subspace_method(
                     datamodule=self,
                     max_epochs=self.max_epochs,
                     k=None,
-                    train_subspace=True
+                    train_subspace=True,
                 )
 
                 # modify the subspace method architecture if specified
                 if self.layer_mods is not None:
-                    self.subspace_method_train = model_modifier.modify_model_architecture(
-                        self.subspace_method_train,
-                        self.layer_mods,
+                    self.subspace_method_train = (
+                        model_modifier.modify_model_architecture(
+                            self.subspace_method_train,
+                            self.layer_mods,
+                        )
                     )
 
                 # train the subspace method and convert train dataset to the latent space
@@ -612,17 +623,16 @@ def setup(
 
             # we have already trained the subspace method, so load it from the checkpoint
             self.subspace_method_train = self.subspace_method(
-                self,
-                max_epochs=self.max_epochs,
-                k=None,
-                train_subspace=False
+                self, max_epochs=self.max_epochs, k=None, train_subspace=False
             )  # will return an initialised subspace method with the subspace models as instance attributes
 
             # modify the subspace method architecture if specified
             if self.layer_mods is not None:
-                self.subspace_method_train = model_modifier.modify_model_architecture(
-                    self.subspace_method_train,
-                    self.layer_mods,
+                self.subspace_method_train = (
+                    model_modifier.modify_model_architecture(
+                        self.subspace_method_train,
+                        self.layer_mods,
+                    )
                 )
 
             # load checkpoint state dict
@@ -656,7 +666,10 @@ def train_dataloader(self):
             Dataloader for training.
         """
         return DataLoader(
-            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers
+            self.train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.num_workers,
         )
 
     def val_dataloader(self):
         """
         Returns
         -------
         dataloader : dataloader
             Dataloader for validation.
""" return DataLoader( - self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, ) @@ -728,23 +744,23 @@ class KFoldDataModule(pl.LightningDataModule): """ def __init__( - self, - fusion_model, - sources, - output_paths, - prediction_task, - batch_size, - num_folds, - multiclass_dimensions, - subspace_method=None, - image_downsample_size=None, - layer_mods=None, - max_epochs=1000, - extra_log_string_dict=None, - own_early_stopping_callback=None, - num_workers=0, - own_kfold_indices=None, - kwargs=None, + self, + fusion_model, + sources, + output_paths, + prediction_task, + batch_size, + num_folds, + multiclass_dimensions, + subspace_method=None, + image_downsample_size=None, + layer_mods=None, + max_epochs=1000, + extra_log_string_dict=None, + own_early_stopping_callback=None, + num_workers=0, + own_kfold_indices=None, + kwargs=None, ): """ Parameters @@ -877,8 +893,8 @@ def kfold_split(self): return folds # list of tuples of (train_dataset, test_dataset) def setup( - self, - checkpoint_path=None, + self, + checkpoint_path=None, ): """ Splits the data into train and test sets, and runs the subspace method if specified @@ -1014,7 +1030,10 @@ def train_dataloader(self, fold_idx): self.train_dataset, self.test_dataset = self.folds[fold_idx] return DataLoader( - self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers + self.train_dataset, + batch_size=self.batch_size, + shuffle=True, + num_workers=self.num_workers, ) def val_dataloader(self, fold_idx): @@ -1034,7 +1053,10 @@ def val_dataloader(self, fold_idx): self.train_dataset, self.test_dataset = self.folds[fold_idx] return DataLoader( - self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers + self.test_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.num_workers, ) @@ -1078,15 +1100,15 @@ class TrainTestGraphDataModule: """ def __init__( - self, - fusion_model, - sources, - graph_creation_method, - test_size, - image_downsample_size=None, - layer_mods=None, - extra_log_string_dict=None, - own_test_indices=None, + self, + fusion_model, + sources, + graph_creation_method, + test_size, + image_downsample_size=None, + layer_mods=None, + extra_log_string_dict=None, + own_test_indices=None, ): """ Parameters @@ -1174,9 +1196,7 @@ def setup(self): self.test_idxs = test_dataset.indices else: self.test_idxs = self.own_test_indices - self.train_idxs = list( - set(range(len(self.dataset))) - set(self.test_idxs) - ) + self.train_idxs = list(set(range(len(self.dataset))) - set(self.test_idxs)) # get the graph data structure self.graph_maker_instance = self.graph_creation_method(self.dataset) @@ -1247,15 +1267,15 @@ class KFoldGraphDataModule: """ def __init__( - self, - num_folds, - fusion_model, - sources, - graph_creation_method, - image_downsample_size=None, - layer_mods=None, - extra_log_string_dict=None, - own_kfold_indices=None, + self, + num_folds, + fusion_model, + sources, + graph_creation_method, + image_downsample_size=None, + layer_mods=None, + extra_log_string_dict=None, + own_kfold_indices=None, ): """ Parameters @@ -1369,7 +1389,7 @@ def setup(self): # modify the graph maker architecture if specified if self.layer_mods is not None: - graph_maker = model_modifier.modify_model_architecture( + self.graph_maker_instance = model_modifier.modify_model_architecture( self.graph_maker_instance, self.layer_mods, ) @@ 
-1414,25 +1434,25 @@ def get_lightning_module(self): def prepare_fusion_data( - prediction_task, - fusion_model, - data_paths, - output_paths, - kfold=False, - num_folds=None, - test_size=0.2, - batch_size=8, - multiclass_dimensions=None, - image_downsample_size=None, - layer_mods=None, - max_epochs=1000, - checkpoint_path=None, - extra_log_string_dict=None, - own_early_stopping_callback=None, - num_workers=0, - test_indices=None, - own_kfold_indices=None, - **kwargs, + prediction_task, + fusion_model, + data_paths, + output_paths, + kfold=False, + num_folds=None, + test_size=0.2, + batch_size=8, + multiclass_dimensions=None, + image_downsample_size=None, + layer_mods=None, + max_epochs=1000, + checkpoint_path=None, + extra_log_string_dict=None, + own_early_stopping_callback=None, + num_workers=0, + test_indices=None, + own_kfold_indices=None, + **kwargs, ): """ Gets the data module for a specific fusion model and training protocol. @@ -1497,7 +1517,8 @@ def prepare_fusion_data( if kfold and own_early_stopping_callback is not None: raise ValueError( - "Cannot use own early stopping callback with kfold cross validation yet. Working on fixing this currently (Nov 2023)") + "Cannot use own early stopping callback with kfold cross validation yet. Working on fixing this currently (Nov 2023)" + ) # Getting the data paths from the data_paths dictionary into a list data_sources = [ @@ -1519,7 +1540,7 @@ def prepare_fusion_data( image_downsample_size=image_downsample_size, layer_mods=layer_mods, extra_log_string_dict=extra_log_string_dict, - # here is where the kfold split will go + own_kfold_indices=own_kfold_indices, ) else: graph_data_module = TrainTestGraphDataModule( @@ -1543,7 +1564,9 @@ def prepare_fusion_data( for dm_instance in data_module: dm_instance.data_dims = graph_data_module.data_dims dm_instance.own_early_stopping_callback = own_early_stopping_callback - dm_instance.graph_maker_instance = graph_data_module.graph_maker_instance + dm_instance.graph_maker_instance = ( + graph_data_module.graph_maker_instance + ) dm_instance.output_paths = output_paths dm_instance.num_folds = num_folds dm_instance.prediction_task = prediction_task diff --git a/fusilli/fusionmodels/tabularfusion/mcvae_model.py b/fusilli/fusionmodels/tabularfusion/mcvae_model.py index d1563ab..f11981e 100644 --- a/fusilli/fusionmodels/tabularfusion/mcvae_model.py +++ b/fusilli/fusionmodels/tabularfusion/mcvae_model.py @@ -11,6 +11,7 @@ import pandas as pd import numpy as np from fusilli.utils.training_utils import get_checkpoint_filenames_for_subspace_models +import sys from fusilli.utils import check_model_validity @@ -136,7 +137,9 @@ def load_ckpt(self, checkpoint_path): init_dict = { "n_channels": 2, "lat_dim": self.num_latent_dims, - "n_feats": tuple([self.datamodule.data_dims[0], self.datamodule.data_dims[1]]), + "n_feats": tuple( + [self.datamodule.data_dims[0], self.datamodule.data_dims[1]] + ), } self.fit_model = Mcvae(**init_dict, sparse=True) @@ -261,7 +264,9 @@ def train(self, train_dataset, val_dataset=None): with contextlib.redirect_stdout(None): mcvae_fit.optimize(epochs=self.max_epochs, data=mcvae_training_data) ideal_epoch = mcvae_early_stopping_tol( - tolerance=mcvae_tolerance, patience=mcvae_patience, loss_logs=mcvae_fit.loss["total"] + tolerance=mcvae_tolerance, + patience=mcvae_patience, + loss_logs=mcvae_fit.loss["total"], ) mcvae_esfit = Mcvae(**init_dict, sparse=True) @@ -284,7 +289,9 @@ def train(self, train_dataset, val_dataset=None): # getting mean latent space mean_latents = 
self.get_latents(mcvae_training_data) - return torch.Tensor(mean_latents), pd.DataFrame(labels, columns=["prediction_label"]) + return torch.Tensor(mean_latents), pd.DataFrame( + labels, columns=["prediction_label"] + ) def convert_to_latent(self, test_dataset): """ @@ -373,7 +380,9 @@ def __init__(self, prediction_task, data_dims, multiclass_dimensions): multiclass_dimensions : int Number of classes in the multiclass classification task. """ - ParentFusionModel.__init__(self, prediction_task, data_dims, multiclass_dimensions) + ParentFusionModel.__init__( + self, prediction_task, data_dims, multiclass_dimensions + ) self.prediction_task = prediction_task
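
Below is a usage sketch of how the new ``training_modifications`` option combines with k-fold cross-validation. It is illustrative only and not part of the patch: the placeholder names ``example_model``, ``data_paths``, and ``output_path`` are reused from the documentation example above, ``num_folds=5`` is an arbitrary choice, and it assumes ``prepare_fusion_data`` accepts ``training_modifications`` through its ``**kwargs``, as the new documentation page states.

.. code-block:: python

    from fusilli.data import prepare_fusion_data
    from fusilli.train import train_and_save_models

    # Accelerator settings; the keys mirror PyTorch Lightning Trainer arguments.
    gpu_settings = {"accelerator": "gpu", "devices": 1}

    # Prepare a 5-fold cross-validation data module. Subspace methods (if the
    # fusion model uses one) are trained during this step, which is the case
    # the new warning in the docs flags as not yet GPU-enabled.
    datamodule = prepare_fusion_data(
        prediction_task="binary",
        fusion_model=example_model,
        data_paths=data_paths,
        output_paths=output_path,
        kfold=True,
        num_folds=5,
        training_modifications=gpu_settings,
    )

    # Train and save one model per fold, each using the GPU settings above.
    trained_model_list = train_and_save_models(
        data_module=datamodule,
        fusion_model=example_model,
        training_modifications=gpu_settings,
    )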