From 286af30fb166fbbc66bd22fa35521be763b80fe0 Mon Sep 17 00:00:00 2001 From: Dave McKay Date: Wed, 1 May 2024 10:45:17 +0100 Subject: [PATCH 01/13] starting training instructions --- docs/data-generation.md | 2 +- docs/training_implementation.md | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 docs/training_implementation.md diff --git a/docs/data-generation.md b/docs/data-generation.md index 63a9fd8..58014f5 100644 --- a/docs/data-generation.md +++ b/docs/data-generation.md @@ -1,6 +1,6 @@ # Data Generation -Following the structure given in the [general data generation](ML_training.md) case, this page describes our implementation for the Hasegawa-Wakatani example. +Following the structure given in the [general data generation](ML_training.md) case, this page describes the data generation phase of our implementation for the Hasegawa-Wakatani example. 1. Fine- and coarse-grained resolutions. We chose: - 1024x1024 for fine-grained simulations; and diff --git a/docs/training_implementation.md b/docs/training_implementation.md new file mode 100644 index 0000000..c52d284 --- /dev/null +++ b/docs/training_implementation.md @@ -0,0 +1,9 @@ +# ML Model Training + +Following on from the [data generation phase](data-generation.md) of our implementation for the Hasegawa-Wakatani example, this page describes how we train our ML models. + +1. Error calculation. + + We are at the stage of having fine-grained simulation trajectories, and from those, extracted data for each timestep, coarsened that data, and run single-timestep coarse-grained simulations. + + Now, the task is to take the difference between timestep 1 and timestep 0 of those coarse-grained simulations. \ No newline at end of file From 47ba4e21a1c464504efda9ef8dbc404d3bc6454b Mon Sep 17 00:00:00 2001 From: Dave McKay Date: Wed, 1 May 2024 11:56:46 +0100 Subject: [PATCH 02/13] removed zero model --- files/ML_model/write_zero_model.py | 96 ------------------------------ files/ML_model/zero_model_test.py | 35 ----------- 2 files changed, 131 deletions(-) delete mode 100644 files/ML_model/write_zero_model.py delete mode 100644 files/ML_model/zero_model_test.py diff --git a/files/ML_model/write_zero_model.py b/files/ML_model/write_zero_model.py deleted file mode 100644 index 094a4cd..0000000 --- a/files/ML_model/write_zero_model.py +++ /dev/null @@ -1,96 +0,0 @@ -import argparse -import os - -parser = argparse.ArgumentParser( - prog='write_zero_model.py', - description='Write a model that produces zero to a file') -parser.add_argument('x', help='grid size x', type=int) -parser.add_argument('z', help='grid size z', type=int) -parser.add_argument('-f', '--filename', help='output filename', required=True) -args = parser.parse_args() - - -from smartsim.ml.tf import freeze_model - -from tensorflow.keras.layers import Input, Normalization, Conv2D -from tensorflow.keras import Model -from tensorflow import keras -import tensorflow as tf - -tf.keras.backend.set_floatx('float64') - -import padding - -import numpy as np - -model = keras.Sequential() - -model.add(Input(shape=(args.x, args.z, 1))) - -#1 -# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D (filters=64, kernel_size=3, padding='valid', activation='relu')) - -#2 -# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D (filters=64, kernel_size=3, padding='valid', activation='relu')) - -#3 
-# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D(filters=64, kernel_size=3, padding='valid', activation='relu')) - -#4 -# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D (filters=64, kernel_size=3, padding ='valid', activation='relu')) -#model.add(Normalization(axis=-1, mean=None, variance=None))\ - -#5 -# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D (filters=64, kernel_size=3, padding='valid', activation='relu')) -#model.add(Normalization(axis=-1, mean=None, variance=None))\ - -#6 -# pad -model.add(padding.CyclicPadding2D(padding=(1,1))) -model.add(keras.layers.ZeroPadding2D(padding=([(1,1), (0,0)]))) - -model.add(Conv2D (filters=64, kernel_size=3, padding ='valid', activation='relu')) - -model.add(Conv2D (filters=1, kernel_size=3, padding ='same', activation='relu')) - -model.summary() - -model.compile( - # Optimizer - optimizer=keras.optimizers.Adam(learning_rate=0.001), - # Loss function to minimize - loss=keras.losses.MeanSquaredError() - # List of metrics to monitor - #metrics=[keras.metrics.SparseCategoricalAccuracy()], -) - -last_layer = len(model.layers) - 1 -zero_weights = [tf.zeros(shape=(3,3,64,1)), tf.zeros(shape=(1,))] -model.layers[last_layer].set_weights(zero_weights) - -# SmartSim utility for Freezing the model and saving it to a file. -model_path, inputs, outputs = freeze_model(model, os.getcwd(), args.filename) - -print(model_path) -print(inputs) -print(outputs) diff --git a/files/ML_model/zero_model_test.py b/files/ML_model/zero_model_test.py deleted file mode 100644 index 05d7adc..0000000 --- a/files/ML_model/zero_model_test.py +++ /dev/null @@ -1,35 +0,0 @@ -from smartredis import Client -from smartsim import Experiment - -import numpy as np - -exp = Experiment("Inference-Test", launcher="local") - -db = exp.create_database(port=6899, interface="lo") -exp.start(db) - -print(db.get_address()) - -model_path = '/path/to/zero-model-xxx-zzz.pb' -inputs = ['args_0'] -outputs = ['Identity'] - -client = Client(address=db.get_address()[0], cluster=False) - -client.set_model_from_file( - "keras_fcn", model_path, "TF", device="CPU", inputs=inputs, outputs=outputs -) - -# put random random input tensor into the database -input_data = np.random.rand(1, 260, 256, 1).astype(np.float64) -client.put_tensor("input", input_data) - -# run the Fully Connected Network model on the tensor we just put -# in and store the result of the inference at the "output" key -client.run_model("keras_fcn", "input", "output") - -# get the result of the inference -pred = client.get_tensor("output") -print(pred) - -exp.stop(db) From 49e6987db1b7d37e7174b1cc8b3c45431bd8a901 Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:04:26 +0100 Subject: [PATCH 03/13] training pipeline --- files/training/README.md | 7 +- files/training/data_read.py | 69 ++++++++++++++++++ files/training/model.py | 82 +++++++++++++++++++++ files/training/padding.py | 75 +++++++++++++++++++ files/training/tf_utils.py | 56 +++++++++++++++ files/training/training.py | 140 ++++++++++++++++++++++++++++++++++++ 6 files changed, 423 insertions(+), 6 deletions(-) create mode 100644 files/training/data_read.py create mode 100644 files/training/model.py create mode 100644 files/training/padding.py create mode 100644 files/training/tf_utils.py create 
mode 100644 files/training/training.py diff --git a/files/training/README.md b/files/training/README.md index b37b5a4..5075b80 100644 --- a/files/training/README.md +++ b/files/training/README.md @@ -1,6 +1 @@ -**instructions.md** - -- Data Driven Algorithms for Exascale -- Exeter, 30.-31. January 2024 - -A complete walk-through from installation of all software to running the ML-boosted pipeline with zero-model (no effective correction) \ No newline at end of file +Training pipeline for training the error correction ML model. \ No newline at end of file diff --git a/files/training/data_read.py b/files/training/data_read.py new file mode 100644 index 0000000..799ef41 --- /dev/null +++ b/files/training/data_read.py @@ -0,0 +1,69 @@ +""" Functions to load/augment training dataset """ +import tensorflow as tf +import numpy as np +import netCDF4 as nc + +from typing import List, Tuple, Dict + +def extract_array_data(file_path: str, args) -> np.ndarray: + dataset = nc.Dataset(file_path, 'r') + + # number of ghost cells in x dimension + gx = 2 + # extract vorticity and density without ghost cells and remove unit y direction + vort_array = np.squeeze(dataset.variables['vort'][:,gx:-gx,:,:]) + dens_array = np.squeeze(dataset.variables['n'][:,gx:-gx,:,:]) + dataset.close() + + if args.vort_only: + flow_image = np.stack([vort_array], axis=-1) + elif args.dens_only: + flow_image = np.stack([dens_array], axis=-1) + else: + flow_image = np.stack([vort_array, dens_array], axis=-1) + return flow_image + +def translate_augmentation(fields: Dict[str, tf.Tensor]) -> Dict[str, tf.Tensor]: + coarse_image, error_image = fields['coarse'], fields['error'] + #commented out for testing + #if coarse_image.shape != error_image.shape: + # raise ValueError(f"Coarse grained data and error should be same shape (got {coarse_image.shape} and {error_image.shape} respectively).") + shape = tf.shape(coarse_image) + nx, nz = shape[0], shape[1] + shift_x = tf.random.uniform(shape=[], minval=0, maxval=nx-1, dtype=tf.int32) + shift_z = tf.random.uniform(shape=[], minval=0, maxval=nz-1, dtype=tf.int32) + + # apply same shift to coarse snapshot and error + coarse_shifted = tf.roll(coarse_image, shift_x, 0) + coarse_shifted = tf.roll(coarse_shifted, shift_z, 1) + error_shifted = tf.roll(error_image, shift_x, 0) + error_shifted = tf.roll(error_shifted, shift_z, 1) + return {'coarse': coarse_shifted, 'error': error_shifted} + +def data_generator(ground_truth_file_names: List[str], coarse_grained_file_names: List[str], args): + for gt_file, cg_file in zip(ground_truth_file_names, coarse_grained_file_names): + raw_data_gt = extract_array_data(gt_file, args) + raw_data_cg = extract_array_data(cg_file, args)[1:] + error = raw_data_gt - raw_data_cg + + # Reshape tensors to have dynamic dimensions + raw_data_cg = tf.convert_to_tensor(raw_data_cg, dtype=tf.float64) + error = tf.convert_to_tensor(error, dtype=tf.float64) + for i in range(raw_data_cg.shape[0]): + yield {'coarse': raw_data_cg[i], 'error': error[i]} + +def generate_augmented_dataset( + ground_truth_file_names: List[str], + coarse_grained_file_names: List[str], + args, +) -> tf.data.Dataset: + if args.vort_only or args.dens_only: + channels = 1 + else: + channels = 2 + dataset = tf.data.Dataset.from_generator( + lambda: data_generator(ground_truth_file_names, coarse_grained_file_names, args), + output_signature={'coarse': tf.TensorSpec(shape=(None, None, channels), dtype=tf.float64), + 'error': tf.TensorSpec(shape=(None, None, channels), dtype=tf.float64)} + ) + return 
dataset.map(translate_augmentation) diff --git a/files/training/model.py b/files/training/model.py new file mode 100644 index 0000000..f84c509 --- /dev/null +++ b/files/training/model.py @@ -0,0 +1,82 @@ +from tensorflow.keras.layers import Input, Normalization, Conv2D +from tensorflow.keras import Model +from tensorflow import keras +from typing import Tuple + +import padding + +''' +preprocessing: + +- "cyclic" padding along the flow direction (wall dimension is 0-padded): + the input padded with 2 columns on both sides that 'wrap' around (see np.pad('wrap')) + two rows of 0 on top and bottom + ! needs to be reapplied after every convolution layer +''' + +''' +TODO/preprocessing: + +- rescale input to [-1, 1]: + To rescale an input in the [0, 255] range to be in the [-1, 1] range, + you would pass scale=1./127.5, offset=-1. + in general: scale = scaled_max/(max * .5) + offset = min + scaled_min + keras.layers.Rescaling(scale, offset=0.0, **kwargs) + + keras seems not to support min-max scaling with variable min/max, so will normalise instead +''' + +def kochkov_cnn(image_shape: Tuple[int]) -> keras.Model: + """ Todo: need a more automated way of padding e.g. if we adjust filter size. """ + model = keras.Sequential() + + model.add(Input(shape=image_shape)) + + #1 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) +# model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters=64, kernel_size=3, padding ='valid', activation='relu')) + + #2 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) +# model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters =64, kernel_size =3, padding ='valid', activation='relu')) + + #3 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) +# model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters =64, kernel_size =3, padding ='valid', activation='relu')) + + #4 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) +# model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters =64, kernel_size =3, padding ='valid', activation='relu')) + + #5 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) +# model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters =64, kernel_size =3, padding ='valid', activation='relu')) + + #6 + # pad + model.add(padding.CyclicPadding2D(padding=(1,1))) + # model.add(keras.layers.ZeroPadding2D(padding=(1,0))) + + model.add(Conv2D (filters =64, kernel_size =3, padding ='valid', activation='relu')) + + # output + model.add(padding.CyclicPadding2D(padding=(1,1))) + + model.add(Conv2D(filters=image_shape[2], kernel_size =3, padding ='valid', activation='linear')) + return model diff --git a/files/training/padding.py b/files/training/padding.py new file mode 100644 index 0000000..acd8b9c --- /dev/null +++ b/files/training/padding.py @@ -0,0 +1,75 @@ +import tensorflow as tf + +# from keras.engine.base_layer import Layer +# from keras.engine.input_spec import InputSpec +# from keras.utils import conv_utils +from tensorflow.keras.layers import Layer +from tensorflow.keras.layers import InputSpec +from tensorflow.python.keras.utils import conv_utils + +# some ideas here: +# https://stackoverflow.com/questions/54911015/keras-convolution-layer-on-images-coming-from-circular-cyclic-domain + +class CyclicPadding2D(Layer): + def __init__(self, padding=(1, 1), data_format=None, **kwargs): + super().__init__(**kwargs) + self.data_format = conv_utils.normalize_data_format(data_format) + if 
len(padding) != 2: + raise ValueError('`padding` should have two elements. ' + f'Received: {padding}.') + self.padding = padding + self.input_spec = InputSpec(ndim=4) + + def get_config(self): + config = super().get_config() + config.update({ + "padding": self.padding, + "data_format": self.data_format, + }) + return config + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == 'channels_first': + if input_shape[2] is not None: + rows = input_shape[2] + 2 * self.padding[0] + else: + rows = None + if input_shape[3] is not None: + cols = input_shape[3] + 2 * self.padding[1] + else: + cols = None + return tf.TensorShape( + [input_shape[0], input_shape[1], rows, cols]) + elif self.data_format == 'channels_last': + if input_shape[1] is not None: + rows = input_shape[1] + 2 * self.padding[0] + else: + rows = None + if input_shape[2] is not None: + cols = input_shape[2] + 2 * self.padding[1] + else: + cols = None + return tf.TensorShape([input_shape[0], rows, cols, input_shape[3]]) + + def call(self, inputs): + tensor = inputs + ndim = len(inputs.shape) + for ax, pd in enumerate(self.padding): + if self.data_format == "channels_last": + #(batch, rows, cols, channels) + axis = 1 + ax + elif self.data_format == "channels_first": + #(batch, channels, rows, cols) + axis = 2 + ax + else: + return + sl_start = [slice(None, pd) if i == axis else slice(None) for i in range(ndim)] + sl_end = [slice(-pd, None) if i == axis else slice(None) for i in range(ndim)] + tensor = tf.concat([ + tensor[sl_end], + tensor, + tensor[sl_start], + ], axis) + + return tensor diff --git a/files/training/tf_utils.py b/files/training/tf_utils.py new file mode 100644 index 0000000..af99fd0 --- /dev/null +++ b/files/training/tf_utils.py @@ -0,0 +1,56 @@ +import typing as t +from pathlib import Path + +import keras +import tensorflow as tf +from tensorflow.python.framework.convert_to_constants import ( + convert_variables_to_constants_v2, +) + +def freeze_model( + model: keras.Model, output_dir: str, file_name: str +) -> t.Tuple[str, t.List[str], t.List[str]]: + """Freeze a Keras or TensorFlow Graph + + to use a Keras or TensorFlow model in SmartSim, the model + must be frozen and the inputs and outputs provided to the + smartredis.client.set_model_from_file() method. + + This utiliy function provides everything users need to take + a trained model and put it inside an ``orchestrator`` instance + + :param model: TensorFlow or Keras model + :type model: tf.Module + :param output_dir: output dir to save model file to + :type output_dir: str + :param file_name: name of model file to create + :type file_name: str + :return: path to model file, model input layer names, model output layer names + :rtype: str, list[str], list[str] + """ + # TODO figure out why layer names don't match up to + # specified name in Model init. 
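+    # The freeze below wraps the model in a tf.function, traces a concrete
+    # function from the model's input signature, folds trained variables into
+    # graph constants, and writes the result to a single .pb GraphDef; the
+    # returned input/output names are the ones set_model_from_file() needs.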
+ + if not file_name.endswith(".pb"): + file_name = file_name + ".pb" + + full_model = tf.function(model) + full_model = full_model.get_concrete_function( + tf.TensorSpec(model.inputs[0].shape, model.inputs[0].dtype) + ) + + frozen_func = convert_variables_to_constants_v2(full_model) + frozen_func.graph.as_graph_def() + + input_names = [x.name.split(":")[0] for x in frozen_func.inputs] + output_names = [x.name.split(":")[0] for x in frozen_func.outputs] + + tf.io.write_graph( + graph_or_graph_def=frozen_func.graph, + logdir=output_dir, + name=file_name, + as_text=False, + ) + model_file_path = str(Path(output_dir, file_name).resolve()) + return model_file_path, input_names, output_names + diff --git a/files/training/training.py b/files/training/training.py new file mode 100644 index 0000000..9bb6847 --- /dev/null +++ b/files/training/training.py @@ -0,0 +1,140 @@ +""" Setup training loop for basic model """ +import tensorflow as tf +import model +import data_read as dr +from model import kochkov_cnn +from datetime import datetime +import argparse + +# monitoring and debugging through tensorboard +#log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") +#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) +#tf.debugging.experimental.enable_dump_debug_info(log_dir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1) + +tf.keras.utils.disable_interactive_logging() + +parser = argparse.ArgumentParser(description='Model training') +parser.add_argument('-lr', '--learning-rate', type=float, required=True) +parser.add_argument('-b', '--batch-size', type=int, required=True) +parser.add_argument('-ep', '--epochs', type=int, required=True) +parser.add_argument('-id', '--task-id', default='') +parser.add_argument('--vort-only', action='store_true',) +parser.add_argument('--dens-only', action='store_true',) +args = parser.parse_args() + +# problem size +Nx = 256 +Nz = 256 +if args.vort_only or args.dens_only: + channels = 1 +else: + channels = 2 +val_frac = 0.2 + +samples_per_file = 1000 # to estimate train/val split +#data_location = '/scratch/space1/d175/data/training/derived/notebooks/' +data_location = '/work/d175/d175/akexcml/smartsim/python/data/' +file_nums = list(range(1,33)) # [ x+1 for x in range(32) ] + +# training protocol +#learning_rate = 1e-3 +#epochs = 2 +#batch_size = 32 +learning_rate = args.learning_rate +epochs = args.epochs +batch_size = args.batch_size + +trun_label = datetime.now().strftime("%Y%m%d-%H%M%S") +if args.task_id: + trun_label = f'{trun_label}-{args.task_id}' +log_dir = f"logs/fit/{trun_label}" +checkpoint_filepath = 'checkpoints/' + trun_label + '/weights.{epoch:03d}.hdf5' + +print('****************************************************') +print(f'learning rate: {learning_rate}') +print(f'epochs: {epochs}') +print(f'batch size: {batch_size}') +print(f'training run: {trun_label}') +print(f'data files: {file_nums}') +print(f'channels: {channels}', end='') +if args.vort_only: + print(' (vort)') +elif args.dens_only: + print(' (dens)') +else: + print('') +print('****************************************************') + +# compile model +model = kochkov_cnn((Nx, Nz, channels)) +model.summary() +model.compile(loss='mean_squared_error', run_eagerly=False, jit_compile=False, optimizer=tf.keras.optimizers.Adam(jit_compile=False)) + +# error = gt - sim +# load ground truth (gt) and coarse-grained (='sim', cg) +file_paths_gt = [data_location + 'gt_traj_' + str(n) + '.nc' for n in file_nums] +file_paths_cg = 
[data_location + 'sim_traj_' + str(n) + '.nc' for n in file_nums] +n_samples = samples_per_file * len(file_paths_cg) + +# generate dataset (from generator, data not preloaded) +dataset = dr.generate_augmented_dataset(file_paths_gt, file_paths_cg, args) +dataset = dataset.shuffle(buffer_size=100) +dataset.prefetch(batch_size) + +# split the dataset into training and validation sets +train_size = int(n_samples * (1 - val_frac)) +val_size = n_samples - train_size +train_dataset = dataset.take(train_size) +val_dataset = dataset.skip(train_size) + +# training +def input_and_target(sample): + """ Ensure data is correctly sized and split into input/target """ + return ( + tf.image.resize(sample['coarse'], (Nx, Nz)), + tf.image.resize(sample['error'], (Nx, Nz)) + ) + +# batch the training and validation datasets +train_dataset = train_dataset.map(input_and_target).batch(batch_size) +val_dataset = val_dataset.map(input_and_target).batch(batch_size) + +# monitor training through tensorboard +#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1) +#tf.debugging.experimental.enable_dump_debug_info(log_dir, tensor_debug_mode="FULL_HEALTH", circular_buffer_size=-1) + +# checkpoint weights after each epoch +model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=checkpoint_filepath, + save_weights_only=True, +) + +history = model.fit( + train_dataset, + validation_data=val_dataset, + epochs=epochs, + callbacks=[ +# tensorboard_callback, + model_checkpoint_callback, + ], +) +for key in history.history.keys(): + print(key) + print(history.history[key]) + +print('***************************************') +print('Freeze model: ', end='') +try: + # try if smartsim is available + from smartsim.ml.tf import freeze_model + print('Using smartsim') +except: + print('Failed to load smartsim, using tf_utils') + from tf_utils import freeze_model +import os + +model_path, inputs, outputs = freeze_model(model, os.getcwd(), f"model-hw-{trun_label}.pb") +print(model_path) +print(inputs) +print(outputs) +print('***************************************') From 5cd2c7bbaab4c6049a430067129912146b6ac44c Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:06:55 +0100 Subject: [PATCH 04/13] remove instructions for workshop hands-on --- files/training/instructions.md | 258 --------------------------------- 1 file changed, 258 deletions(-) delete mode 100644 files/training/instructions.md diff --git a/files/training/instructions.md b/files/training/instructions.md deleted file mode 100644 index 98dc0c3..0000000 --- a/files/training/instructions.md +++ /dev/null @@ -1,258 +0,0 @@ -# hands-on: Simulation and Machine Learning Integration -**(ExCALIBUR workshop: Data Driven Algorithms)** - -## 1. Cirrus - -Log in to Cirrus: - -`$ ssh username@login.cirrus.ac.uk` - -or, if you used any non-default name or location for your private SSH key, `$ ssh username@login.cirrus.ac.uk -i /home/user/.ssh/id_rsa_cirrus` . - -Feel free to have a look around. - - -### File systems - -There are two file systems we will concern ourselves with, `/home` and `/work`. On logging in you will find yourself in your home directory at -`/home/tc057/tc057/username`. The `/home` file system is not particularly large but can be used to store some important files. Importantly, it is not mounted on the compute nodes. Jobs will instead be run from within the `/work` file system. - -You will have your own work directory at `/work/tc057/tc057/username`. 
Disc space is shared between all members of the project (i.e. all attendees and demonstrators at this event). Please keep this for the exercises only, and clean up if you accidentally produce any huge files. - -Your home and work directories are kept private to you alone. If you want to share files with anyone else, or if the demonstrators want to share files with you, the shared directories can be used. These exist on both file systems in two hierarchies. To share with other users in this project (who are also members of the tc057 Unix group), use -`/work/tc057/tc057/shared`, and to share with anyone on any other project you can use `/work/tc057/shared` - -You may still need to set read permissions on anything you copy into these directories. For example, to recursively set group read and execute permissions on a directory, allowing other tc057 project members to read it and its contents: -`$ chmod -R g+rX /work/tc057/tc057/shared/mydirectory`. - - -### Modules - -Environment modules are available. You can use the commands you are probably used to -``` -$ module list -$ module avail -$ module load -``` -to list the currently loaded modules, see what other modules are available, and then to load a module. - - -### Running jobs - -Jobs are run via the Slurm batch system. The job scripts you will use will be covered in the training material. You will be able to run jobs via the normal QoS if you like, but we have also set up reservations on Cirrus to allow our group exclusive access to several compute nodes, ensuring quick job throughput. - -In a Slurm job script the account (budget to charge), partition (group or type of nodes to run on) and QoS (type of job, determining the limits that apply) will be specified by options to the sbatch command used to submit the job. To use a given reservation, you must also provide its code. All in all, you should provide the following options in your scripts: -``` -#SBATCH --account=tc057 -#SBATCH --partition=standard -#SBATCH --qos=reservation -#SBATCH --reservation=tc057_1141276 -``` - -### Further reading - -If you would like to read more about using Cirrus, the documentation is available online at https://docs.cirrus.ac.uk . - - -## 2. Build the individual components - - - -First, set up `.bashrc` and `.bash_login`: login to Cirrus, then enter the following: -``` -echo "source /work/tc057/tc057/$USER/.bashrc" > .bash_login -echo "export WORK=/work${HOME#/home}" > /work/tc057/tc057/$USER/.bashrc -echo "export HOME=$WORK" >> /work/tc057/tc057/$USER/.bashrc -echo "cd $WORK" >> /work/tc057/tc057/$USER/.bashrc -``` -`exit` and log in again to complete this setup. Do `pwd` to check you are in `/work/tc057/tc057/$USER` . - -### BOUT++ - -Load required modules: -``` -module load mpt -module load intel-compilers-19 -module load fftw/3.3.10-intel19-mpt225 -module load netcdf-parallel/4.6.2-intel19-mpt225 -module load cmake -``` - -BOUT++ installation requires a Python environment. We'll use miniconda3: -``` -mkdir -p ~/miniconda3 -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh -bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3 -~/miniconda3/bin/conda init bash -``` - -Now `exit` and login to Cirrus again to complete the miniconda3 setup. 
- -Create and activate a conda environment: -``` -conda create -y --name boutsmartsim python=3.10 -conda activate boutsmartsim -``` - -Build BOUT++: -``` -module load mpt -module load intel-compilers-19 -module load fftw/3.3.10-intel19-mpt225 -module load netcdf-parallel/4.6.2-intel19-mpt225 -module load cmake - -git clone https://github.com/boutproject/BOUT-dev.git -cd /work/tc057/tc057/$USER/BOUT-dev - -MPICXX_CXX=icpc MPICC_CC=icc MPICXX=icpc cmake . -B build -DBOUT_DOWNLOAD_NETCDF_CXX4=ON -DBOUT_USE_LAPACK=off -DCMAKE_CXX_FLAGS=-std=c++17 -DCMAKE_BUILD_TYPE=Release - -export PYTHONPATH=~/BOUT-dev/build/tools/pylib:~/BOUT-dev/tools/pylib:$PYTHONPATH - -cmake --build build -j 6 - -cd ~ -``` - -#### Example: Hasegawa-Wakatani -This will build a *pure* BOUT++ version of the Hasegawa-Wakatani example. A build with SmartSim connection capability is described later. - -In `/work/tc057/tc057/$USER/BOUT-dev/`: -``` -MPICXX_CXX=icpc MPICXX=mpicxx cmake . --build build -DBOUT_BUILD_EXAMPLES=on - -cmake --build build --target hasegawa-wakatani -``` - -### SmartSim and it's ML wrapper - - -#### Python/conda environment - -Add the following packages to install SmartSim ML wrapper: -``` -conda install -y git-lfs -git lfs install -pip install smartsim[ml] -``` - -Build: -``` -module load intel-compilers-19 -module load cmake -smart build --device cpu -``` - -#### Build SmartRedis libraries - -Clone the git repo and the required version and build: -``` -git clone https://github.com/CrayLabs/SmartRedis.git --branch v0.5.0 -cd SmartRedis/ -make lib -``` - -The install path is then available in `SmartRedis/install`. - -### ML-models - -Activate the conda environment with SmartSim -``` -conda activate boutsmartsim -``` - -In this example, we are using a grid 256x256 with 4 guard cells in the x-dimension, hence our model expects a grid of size (260, 256). These dimensions need to match on the resolution specified in the simulation's input file, `/work/tc057/tc057/shared/simulation/run_SmartSim/BOUT.inp`. - -To demonstrate the workflow, we use a model that returns a tensor of 0s, this allows us to easily verify that the added ML-loop does not distort the simulation in any unexpected way. All python scripts used in this section are available in `/work/tc057/tc057/shared/ML_model/`. - -First, we export the ML model to a format suitable for SmartSim -- `zero_model-260-256.pb`, using the `write_zero_model.py` script in your main directory: -``` -cd /work/tc057/tc057/$USER - -python /work/tc057/tc057/shared/simulation/ML_model/write_zero_model.py 260 256 -f zero-model-260-256.pb -``` - -The `write_zero_model.py` uses the target CNN architecture with a modified final layer to force all-0s output while maintaining properties of the model, such as computational effort needed to add the ML inference to the workflow. - -Note that the script requires also `padding.py`; this is our in-house implementation of periodic padding which is currently not implemented in TensorFlow, and you need to have a copy of it in the directory you are running `write_zero_model.py` from. - -You can now test the zero model: -``` -cp /work/tc057/tc057/shared/simulation/ML_model/zero_model_test.py ~ -``` - -Modify the "zero_model_test.py` to have the correct model_path (line 13) and tensor shape (line 24). -``` -python zero_model_test.py -``` -This script launches a database and uploads the zero model. 
It generates a random tensor and uses it as input for the model inference, which should return a tensor of the same dimensions filled with zero. The output gets printed on screen so that one can easily verify the content of the returned tensor. - -> **Try this:** Clone and modify the python scripts to generate an arbitrary "bad" model, and repeat the steps above to test that this new model returns a "bad" tensor. -> -> What makes a model "bad"? - - -## 4. Simulation - -### Compile the example (modified Hasegawa-Wakatani) - -Hasegawa-Wakatani system of equations is included among BOUT++ examples and is part of the installation scripts in /shared/BOUT-dev. We will, however, need to make few modifications to make it run using SmartRedis. - -With `boutsmartsim` still active, load the following modules: -``` -module load mpt -module load intel-compilers-19 -module load fftw/3.3.10-intel19-mpt225 -module load netcdf-parallel/4.6.2-intel19-mpt225 -module load cmake -``` - -Make a working copy of the hasegawa-wakatani example outside the BOUT-dev root directory -``` -cp -r BOUT-dev/examples/hasegawa-wakatani my-bout-smartsim-hw -cd my-bout-smartsim-hw -``` - -Compile your version of Hasegawa-Wakatani example. - -Modify the Hasegawa-Wakatani example: `my-bout-smartsim-hw` should contain (among others) `CMakeLists.txt` and `hw.cxx`. Replace these files with their modified version available in `/work/tc057/tc057/shared/simulation/modified_HW/`. -- The changes to `hw.cxx` implement the call onto the SmartRedis database and the CNN within, and receives and actions the correction to the simulation. -- You will need to further update this version of `CMakeLists.txt` so that it points to the path to the SmartRedis libraries - edit line 5 `set(SMARTREDIS_INSTALL_PATH /path/to/SmartRedis/install)`. - -Set the location of the BOUT++ build path for CMake -``` -cmake . -B build -Dbout++_DIR=../BOUT-dev/build -DCMAKE_CXX_FLAGS=-std=c++17 -DCMAKE_BUILD_TYPE=Release -cmake --build build --target hasegawa-wakatani -``` - - -### Run mHW with ML-model - -Create a run folder and copy over a BOUT input file: -``` -mkdir ~/run -cp /work/tc057/tc057/shared/simulation/run_SmartSim/BOUT.inp ~/run -``` - -You can see an example job-submission script in `/work/tc057/tc057/shared/simulation/run_SmartSim/submit-hw.sh`. This slurm job file starts the SmartSim orchestrator (in Python) with a Redis database and RedisAI communication layer. In this example, the Redis DB runs on the same node since the simulation only runs in one process. The script sets up the environment as needed, in particular, it - -1. starts the RedisAI database and uploads the ML model using a python script `/work/tc057/tc057/shared/simulation/run_SmartSim/start_db.py` with appropriate arguments specifying the port where the database will be available and path to the ML model. This must be done before the simulation starts (or both added to an orchestrator): - the line `python start_db.py 6899 /work/tc057/tc057/$USER/zero-model-260-256.pb` - needs to be modified so that it points to your instance of the ML model (i.e., `/work/tc057/tc057/$USER/zero-model-260-256.p`) - and a suitable `start_db.pb` --- either in `/shared`, or feel free to make and use your local copy where you are launching the script from. - - -2. Sets the environment variable SSDB to points to the database entrypoint to which the simulation connects. - Our example uses 6899, but this can be arbitrarily changed in case a conflict occurs. 
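For orientation, the database-launch step in item 1 reduces to a handful of SmartSim calls. The following is a condensed sketch of what `start_db.py` does, not a drop-in replacement for it; the model key `hw_zero_model` and the tensor names `args_0`/`Identity` must match what the freeze step reported and what the modified simulation requests:
```
import sys
from smartredis import Client
from smartsim import Experiment

db_port = int(sys.argv[1])   # e.g. 6899
model_path = sys.argv[2]     # e.g. zero-model-260-256.pb

# start a local Redis/RedisAI database through the SmartSim orchestrator
exp = Experiment("Inference-Test", launcher="local")
db = exp.create_database(port=db_port, interface="lo")
exp.start(db)

# upload the frozen model so the simulation can invoke it by key
client = Client(address=db.get_address()[0], cluster=False)
client.set_model_from_file(
    "hw_zero_model", model_path, "TF", device="CPU",
    inputs=["args_0"], outputs=["Identity"],
)
```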
- -For additional runs, copy this to your `/work` folder, create a new folder for the outputs, e.g. `hw-run-1`, and edit line 36 to match the new folder (`RUN_FOLDER=~/hw-run-1`). - -Note: this scripts expects `zero-model-260-256.pb` to be in your /work folder. Edit line 32 if you have created it elsewhere. - -This slurm job file starts the SmartSim orchestrator (in Python) with a Redis database and RedisAI communication layer. In this example, the Redis DB runs on the same node since the simulation only runs in one process. - -> **Try this:** Run mHW with the "bad" ML-model. - - - - From c54d4ade04a88ba07aae01a8bb7737713af2d1af Mon Sep 17 00:00:00 2001 From: akrause2014 Date: Wed, 1 May 2024 12:08:02 +0100 Subject: [PATCH 05/13] Update hw.cxx --- files/modified_HW/hw.cxx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/files/modified_HW/hw.cxx b/files/modified_HW/hw.cxx index 5acc5c9..ce14bc1 100644 --- a/files/modified_HW/hw.cxx +++ b/files/modified_HW/hw.cxx @@ -85,9 +85,9 @@ class HW : public PhysicsModel { } int outputMonitor(BoutReal simtime, int iter, int nout) { - output << "iteration = " << iter << std::endl; + // output << "iteration = " << iter << std::endl; if (iter >= 0) { - output << "setting correction to true" << std::endl; + // output << "setting correction to true" << std::endl; addCorrection = true; } return 0; @@ -150,7 +150,7 @@ class HW : public PhysicsModel { } addCorrection = false; - output << "setting correction to false" << std::endl; + // output << "setting correction to false" << std::endl; } // Solve for potential From f21f7814112d701d2eeced9f228fce78b76a2c86 Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:11:32 +0100 Subject: [PATCH 06/13] sbatch script for training --- files/training/submit-training.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 files/training/submit-training.sh diff --git a/files/training/submit-training.sh b/files/training/submit-training.sh new file mode 100644 index 0000000..688dcf3 --- /dev/null +++ b/files/training/submit-training.sh @@ -0,0 +1,23 @@ +#!/bin/bash +# +#SBATCH --partition=gpu +#SBATCH --qos=gpu +#SBATCH --gres=gpu:1 +#SBATCH --time=48:00:00 +#SBATCH --account=x01 + +CUDA_VERSION=11.6 +CUDNN_VERSION=8.6.0-cuda-${CUDA_VERSION} +TENSORRT_VERSION=8.4.3.1-u2 + +module load intel-20.4/compilers +module load nvidia/cudnn/${CUDNN_VERSION} +module load nvidia/tensorrt/${TENSORRT_VERSION} +module load nvidia/nvhpc + +conda activate boutsmartsim + +cd /path/to/SiMLInt/files/training + +# choose appropriate parameters here +python training.py --epochs 100 --batch-size 32 --learning-rate 0.0001 From fefb33f696951e248fdc1d5d8e9dbcb9a0458106 Mon Sep 17 00:00:00 2001 From: akrause2014 Date: Wed, 1 May 2024 12:13:48 +0100 Subject: [PATCH 07/13] Update training.py --- files/training/training.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/files/training/training.py b/files/training/training.py index 9bb6847..aff0489 100644 --- a/files/training/training.py +++ b/files/training/training.py @@ -32,8 +32,7 @@ val_frac = 0.2 samples_per_file = 1000 # to estimate train/val split -#data_location = '/scratch/space1/d175/data/training/derived/notebooks/' -data_location = '/work/d175/d175/akexcml/smartsim/python/data/' +data_location = '/scratch/space1/d175/amy/resize_again/training_data/' file_nums = list(range(1,33)) # [ x+1 for x in range(32) ] # training protocol From faaac44129032c4e47df2b8e1f50dddaab40b81e Mon Sep 17 00:00:00 2001 From: Amy 
Krause Date: Wed, 1 May 2024 12:26:09 +0100 Subject: [PATCH 08/13] renaming --- .../fine_init.sh | 0 .../fine_trajectories.sh | 0 .../resize_trajectory.py | 0 files/{coarsening => 2-coarsening}/restart.py | 0 .../submit-resize.sh | 0 .../coarse_BOUT.inp | 0 .../run_coarse_sims.sh | 0 files/{training => 5-training}/README.md | 0 files/{training => 5-training}/data_read.py | 0 files/{training => 5-training}/model.py | 0 files/{training => 5-training}/padding.py | 0 .../submit-training.sh | 0 files/{training => 5-training}/tf_utils.py | 0 files/{training => 5-training}/training.py | 0 .../CMakeLists.txt | 0 files/{modified_HW => 6-HW-with-model}/hw.cxx | 0 files/ML_model/padding.py | 61 ------------------- 17 files changed, 61 deletions(-) rename files/{data-generation => 1-data-generation}/fine_init.sh (100%) rename files/{data-generation => 1-data-generation}/fine_trajectories.sh (100%) rename files/{coarsening => 2-coarsening}/resize_trajectory.py (100%) rename files/{coarsening => 2-coarsening}/restart.py (100%) rename files/{coarsening => 2-coarsening}/submit-resize.sh (100%) rename files/{coarse_simulations => 3-coarse_simulations}/coarse_BOUT.inp (100%) rename files/{coarse_simulations => 3-coarse_simulations}/run_coarse_sims.sh (100%) rename files/{training => 5-training}/README.md (100%) rename files/{training => 5-training}/data_read.py (100%) rename files/{training => 5-training}/model.py (100%) rename files/{training => 5-training}/padding.py (100%) rename files/{training => 5-training}/submit-training.sh (100%) rename files/{training => 5-training}/tf_utils.py (100%) rename files/{training => 5-training}/training.py (100%) rename files/{modified_HW => 6-HW-with-model}/CMakeLists.txt (100%) rename files/{modified_HW => 6-HW-with-model}/hw.cxx (100%) delete mode 100644 files/ML_model/padding.py diff --git a/files/data-generation/fine_init.sh b/files/1-data-generation/fine_init.sh similarity index 100% rename from files/data-generation/fine_init.sh rename to files/1-data-generation/fine_init.sh diff --git a/files/data-generation/fine_trajectories.sh b/files/1-data-generation/fine_trajectories.sh similarity index 100% rename from files/data-generation/fine_trajectories.sh rename to files/1-data-generation/fine_trajectories.sh diff --git a/files/coarsening/resize_trajectory.py b/files/2-coarsening/resize_trajectory.py similarity index 100% rename from files/coarsening/resize_trajectory.py rename to files/2-coarsening/resize_trajectory.py diff --git a/files/coarsening/restart.py b/files/2-coarsening/restart.py similarity index 100% rename from files/coarsening/restart.py rename to files/2-coarsening/restart.py diff --git a/files/coarsening/submit-resize.sh b/files/2-coarsening/submit-resize.sh similarity index 100% rename from files/coarsening/submit-resize.sh rename to files/2-coarsening/submit-resize.sh diff --git a/files/coarse_simulations/coarse_BOUT.inp b/files/3-coarse_simulations/coarse_BOUT.inp similarity index 100% rename from files/coarse_simulations/coarse_BOUT.inp rename to files/3-coarse_simulations/coarse_BOUT.inp diff --git a/files/coarse_simulations/run_coarse_sims.sh b/files/3-coarse_simulations/run_coarse_sims.sh similarity index 100% rename from files/coarse_simulations/run_coarse_sims.sh rename to files/3-coarse_simulations/run_coarse_sims.sh diff --git a/files/training/README.md b/files/5-training/README.md similarity index 100% rename from files/training/README.md rename to files/5-training/README.md diff --git a/files/training/data_read.py 
b/files/5-training/data_read.py similarity index 100% rename from files/training/data_read.py rename to files/5-training/data_read.py diff --git a/files/training/model.py b/files/5-training/model.py similarity index 100% rename from files/training/model.py rename to files/5-training/model.py diff --git a/files/training/padding.py b/files/5-training/padding.py similarity index 100% rename from files/training/padding.py rename to files/5-training/padding.py diff --git a/files/training/submit-training.sh b/files/5-training/submit-training.sh similarity index 100% rename from files/training/submit-training.sh rename to files/5-training/submit-training.sh diff --git a/files/training/tf_utils.py b/files/5-training/tf_utils.py similarity index 100% rename from files/training/tf_utils.py rename to files/5-training/tf_utils.py diff --git a/files/training/training.py b/files/5-training/training.py similarity index 100% rename from files/training/training.py rename to files/5-training/training.py diff --git a/files/modified_HW/CMakeLists.txt b/files/6-HW-with-model/CMakeLists.txt similarity index 100% rename from files/modified_HW/CMakeLists.txt rename to files/6-HW-with-model/CMakeLists.txt diff --git a/files/modified_HW/hw.cxx b/files/6-HW-with-model/hw.cxx similarity index 100% rename from files/modified_HW/hw.cxx rename to files/6-HW-with-model/hw.cxx diff --git a/files/ML_model/padding.py b/files/ML_model/padding.py deleted file mode 100644 index fda4013..0000000 --- a/files/ML_model/padding.py +++ /dev/null @@ -1,61 +0,0 @@ -import tensorflow as tf -import numpy as np - -from keras.engine.base_layer import Layer -from keras.engine.input_spec import InputSpec -from keras.utils import conv_utils - -# some ideas here: -# https://stackoverflow.com/questions/54911015/keras-convolution-layer-on-images-coming-from-circular-cyclic-domain - -class CyclicPadding2D(Layer): - def __init__(self, padding=(1, 1), data_format=None, **kwargs): - super().__init__(**kwargs) - self.data_format = conv_utils.normalize_data_format(data_format) - if len(padding) != 2: - raise ValueError('`padding` should have two elements. 
' - f'Received: {padding}.') - self.padding = padding - self.input_spec = InputSpec(ndim=4) - - def compute_output_shape(self, input_shape): - input_shape = tf.TensorShape(input_shape).as_list() - if self.data_format == 'channels_first': - if input_shape[2] is not None: - rows = input_shape[2] - else: - rows = None - if input_shape[3] is not None: - cols = input_shape[3] + self.padding[0] + self.padding[1] - else: - cols = None - return tf.TensorShape( - [input_shape[0], input_shape[1], rows, cols]) - elif self.data_format == 'channels_last': - if input_shape[1] is not None: - rows = input_shape[1] - else: - rows = None - if input_shape[2] is not None: - cols = input_shape[2] + self.padding[0] + self.padding[1] - else: - cols = None - return tf.TensorShape([input_shape[0], rows, cols, input_shape[3]]) - - def call(self, inputs): - if self.data_format == "channels_last": - #(batch, rows, cols, channels) - axis = 2 - return tf.concat([ - inputs[:,:,-self.padding[0]:,:], - inputs, - inputs[:,:,:self.padding[1],:] - ], axis) - elif self.data_format == "channels_first": - #(batch, channels, rows, cols) - axis = 3 - return tf.concat([ - inputs[:,:,:,-self.padding[0]:], - inputs, - inputs[:,:,:,:self.padding[1]] - ], axis) From 2aeaef553b05e322153d4c068f108419ad469ac3 Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:33:23 +0100 Subject: [PATCH 09/13] remove number from hw source folder --- files/{6-HW-with-model => HW-error-correction}/CMakeLists.txt | 0 files/{6-HW-with-model => HW-error-correction}/hw.cxx | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename files/{6-HW-with-model => HW-error-correction}/CMakeLists.txt (100%) rename files/{6-HW-with-model => HW-error-correction}/hw.cxx (100%) diff --git a/files/6-HW-with-model/CMakeLists.txt b/files/HW-error-correction/CMakeLists.txt similarity index 100% rename from files/6-HW-with-model/CMakeLists.txt rename to files/HW-error-correction/CMakeLists.txt diff --git a/files/6-HW-with-model/hw.cxx b/files/HW-error-correction/hw.cxx similarity index 100% rename from files/6-HW-with-model/hw.cxx rename to files/HW-error-correction/hw.cxx From 20677fca0c83f3917c89a1f157b844eb4909a8da Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:36:01 +0100 Subject: [PATCH 10/13] rename --- files/{run_SmartSim => 6-simulation}/BOUT.inp | 0 files/{run_SmartSim => 6-simulation}/start_db.py | 0 files/{run_SmartSim => 6-simulation}/submit-hw.sh | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename files/{run_SmartSim => 6-simulation}/BOUT.inp (100%) rename files/{run_SmartSim => 6-simulation}/start_db.py (100%) rename files/{run_SmartSim => 6-simulation}/submit-hw.sh (100%) diff --git a/files/run_SmartSim/BOUT.inp b/files/6-simulation/BOUT.inp similarity index 100% rename from files/run_SmartSim/BOUT.inp rename to files/6-simulation/BOUT.inp diff --git a/files/run_SmartSim/start_db.py b/files/6-simulation/start_db.py similarity index 100% rename from files/run_SmartSim/start_db.py rename to files/6-simulation/start_db.py diff --git a/files/run_SmartSim/submit-hw.sh b/files/6-simulation/submit-hw.sh similarity index 100% rename from files/run_SmartSim/submit-hw.sh rename to files/6-simulation/submit-hw.sh From afc0a4d3fa37fc1e976be4fb6dd6c39b90ace855 Mon Sep 17 00:00:00 2001 From: Dave McKay Date: Wed, 1 May 2024 12:39:23 +0100 Subject: [PATCH 11/13] added step 5 to data-generation --- docs/data-generation.md | 16 ++++-- files/4-training-data/gen_training_nc.py | 52 ++++++++++++++++++++ 
files/4-training-data/sub_gen_training_nc.sh | 16 ++++++ 3 files changed, 79 insertions(+), 5 deletions(-) create mode 100644 files/4-training-data/gen_training_nc.py create mode 100644 files/4-training-data/sub_gen_training_nc.sh diff --git a/docs/data-generation.md b/docs/data-generation.md index 58014f5..4db1da3 100644 --- a/docs/data-generation.md +++ b/docs/data-generation.md @@ -30,9 +30,9 @@ Following the structure given in the [general data generation](ML_training.md) c cmake --build build --target hasegawa-wakatani ``` - Before simulating the training data, a burn-in run must be conducted at the desired resolution. For an example of this, see [fine_init.sh](../files/data-generation/fine_init.sh). Edit `` on line 9 and `x01` in lines containing paths to match your `$WORK` and desired `/scratch` locations and submit via `sbatch fine_init.sh`. + Before simulating the training data, a burn-in run must be conducted at the desired resolution. For an example of this, see [fine_init.sh](../files/1-data-generation/fine_init.sh). Edit `` on line 9 and `x01` in lines containing paths to match your `$WORK` and desired `/scratch` locations and submit via `sbatch fine_init.sh`. - Following that, we run a number of sequentially trajectories to generate fine-grained ground-truth data. See [fine_trajectories.sh](../files/data-generation/fine_trajectories.sh) + Following that, we run a number of sequentially trajectories to generate fine-grained ground-truth data. See [fine_trajectories.sh](../files/1-data-generation/fine_trajectories.sh) The initial simulation produces "restart files", `/scratch/space1/x01/data/my-scratch-data/initial/data/BOUT.restart.*.nc` from which a simulation can be continued. Those, as well as the input file (`/scratch/space1/x01/data/my-scratch-data/initial/data/BOUT.inp` should be placed in `/scratch/space1/x01/data/my-scratch-data/0`. @@ -40,14 +40,20 @@ Following the structure given in the [general data generation](ML_training.md) c 3. Coarsen selected simulation snapshots. - Fine-grained data must be coarsened to match the desired coarse-grained resolution. This can be done via interpolation for a general solution. Files in [files/coarsening](../files/coarsening) perform this task. Submit `submit-resize.sh` via `sbatch submit-resize.sh`. + Fine-grained data must be coarsened to match the desired coarse-grained resolution. This can be done via interpolation for a general solution. Files in [files/2-coarsening](../files/2-coarsening) perform this task. Submit `submit-resize.sh` via `sbatch submit-resize.sh`. _Note: this operates on one trajectory at a time and will therefore need to be repeated for each trajectory run in step 2. 4. Single-timestep coarse simulations. - With the previous step having extracted fine-grained data for each time step (and each trajectory for which it was repeated), we now need to run a single-timestep coarse-grained simulation. To do this, see [files/coarse_simulations](../files/coarse_simulations/). Submitting [run_coarse_sims.sh](../files/coarse_simulations/run_coarse_sims.sh) will run a single step simulation for each coarsened timestep created in the previous step. + With the previous step having extracted fine-grained data for each time step (and each trajectory for which it was repeated), we now need to run a single-timestep coarse-grained simulation. To do this, see [files/3-coarse_simulations](../files/3-coarse_simulations/). 
Submitting [run_coarse_sims.sh](../files/3-coarse_simulations/run_coarse_sims.sh) will run a single step simulation for each coarsened timestep created in the previous step. -Subsequent steps: calculating the error; reformatting data for ingestion into TensorFlow; and model training are covered in [ML model training implementation](training_implementation.md). +5. Generating training data. + + We now have all of the data required to train the ML models, but not in the format we require. Files in [files/4-training-data](../files/4-training-data) perform this task. Edit [files/4-training-data/](../files/4-training-data/sub_gen_training_nc.sh) and [files/4-training-data/gen_training_nc.py](../files/4-training-data/gen_training_nc.py) so that the paths work with your setup. + + _Note: paths are hardcoded in [files/4-training-data/gen_training_nc.py](../files/4-training-data/gen_training_nc.py), not read in from the command line. + +Subsequent steps: calculating the error and model training are covered in [ML model training implementation](training_implementation.md). diff --git a/files/4-training-data/gen_training_nc.py b/files/4-training-data/gen_training_nc.py new file mode 100644 index 0000000..53c7f0f --- /dev/null +++ b/files/4-training-data/gen_training_nc.py @@ -0,0 +1,52 @@ +#!/usr/env/python +# Script to generate training .nc files from BOUT coarse sim files +# Based on traj_netcdf.ipynb +import sys +import numpy as np +import xarray as xr +from xbout import open_boutdataset +from tqdm import tqdm, trange + +basedir = '//scratch/space1/x01/data/my-scratch-data' +outdir = '/scratch/space1/x01/data/my-scratch-data/training/training_nc' + +def read_traj(traj): + dvort0 = [] + dvort1 = [] + dn0 = [] + dn1 = [] + for i in trange(0, 1001): + ds = open_boutdataset( + f'{basedir}/trajectory_{traj}/{i}/coarse_sim/BOUT.dmp.*.nc', + info=False) + dvort0.append(ds['vort'][0,:,:,:]) + dvort1.append(ds['vort'][1,:,:,:]) + dn0.append(ds['n'][0,:,:,:]) + dn1.append(ds['n'][1,:,:,:]) + tvort0 = xr.concat(dvort0[1:], 't') + tn0 = xr.concat(dn0[1:], 't') + tvort1 = xr.concat(dvort1[:1001], 't') + tn1 = xr.concat(dn1[:1001], 't') + d0 = xr.merge([tvort0,tn0]) + d1 = xr.merge([tvort1,tn1]) + return d0, d1 + +def clean(ds): + if 'metadata' in ds.attrs: + del ds.attrs['metadata'] + if 'options' in ds.attrs: + del ds.attrs['options'] + for variable in ds.variables.values(): + if 'metadata' in variable.attrs: + del variable.attrs['metadata'] + if 'options' in variable.attrs: + del variable.attrs['options'] + +traj = sys.argv[1] +d0, d1 = read_traj(traj) +clean(d0) +clean(d1) +d0.to_netcdf(f'{outdir}/gt_traj_{traj}.nc') +d1.to_netcdf(f'{outdir}/sim_traj_{traj}.nc') +#err=d0-d1 +#err.to_netcdf(f'{outdir}/err_traj_{traj}.nc') diff --git a/files/4-training-data/sub_gen_training_nc.sh b/files/4-training-data/sub_gen_training_nc.sh new file mode 100644 index 0000000..6d8b6ca --- /dev/null +++ b/files/4-training-data/sub_gen_training_nc.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +#SBATCH --nodes=1 +#SBATCH --ntasks=1 +# #SBATCH --exclusive +#SBATCH --time=01:00:00 +#SBATCH --partition=standard +#SBATCH --qos=standard +#SBATCH --account= + +eval "$(/work/x01/x01/$USER/miniconda3/bin/conda shell.bash hook)" +conda activate boutsmartsim + +TRAJECTORY=1 + +python gen_training_nc.py $TRAJECTORY From 594cc2579de37e5f9bd3bf51c4a2ace1e8252140 Mon Sep 17 00:00:00 2001 From: Dave McKay Date: Wed, 1 May 2024 12:41:40 +0100 Subject: [PATCH 12/13] added note to training_implementation.md --- docs/training_implementation.md | 4 +++- 1 
file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/training_implementation.md b/docs/training_implementation.md index c52d284..13e98b6 100644 --- a/docs/training_implementation.md +++ b/docs/training_implementation.md @@ -6,4 +6,6 @@ Following on from the [data generation phase](data-generation.md) of our impleme We are at the stage of having fine-grained simulation trajectories, and from those, extracted data for each timestep, coarsened that data, and run single-timestep coarse-grained simulations. - Now, the task is to take the difference between timestep 1 and timestep 0 of those coarse-grained simulations. \ No newline at end of file + Now, the task is to take the difference between timestep 1 and timestep 0 of those coarse-grained simulations. + + UNFINISHED - email d.mckay@epcc.ed.ac.uk if you get this far!! \ No newline at end of file From 406f63cd4e76c6929e215ad16328f212d6880044 Mon Sep 17 00:00:00 2001 From: Amy Krause Date: Wed, 1 May 2024 12:41:43 +0100 Subject: [PATCH 13/13] run simulation with models --- files/6-simulation/start_db.py | 14 +++++++----- files/6-simulation/submit-hw.sh | 39 ++++++++++++++++----------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/files/6-simulation/start_db.py b/files/6-simulation/start_db.py index fb35ca7..1951828 100644 --- a/files/6-simulation/start_db.py +++ b/files/6-simulation/start_db.py @@ -7,8 +7,8 @@ db_port = int(sys.argv[1]) -#model_path = '/work/d175/shared/zero-model-256.pb' -model_path = sys.argv[2] +vort_model_path = sys.argv[2] +n_model_path = sys.argv[3] exp = Experiment("Inference-Test", launcher="local") @@ -17,14 +17,16 @@ print(f'Started Redis database at {db.get_address()[0]}') -# these need to match the outputs from 'write_zero_model.py' +# these need to match the outputs from the model freeze call inputs = ['args_0'] outputs = ['Identity'] client = Client(address=db.get_address()[0], cluster=False) client.set_model_from_file( - "hw_zero_model", model_path, "TF", device="CPU", inputs=inputs, outputs=outputs + "hw_model_vort", vort_model_path, "TF", device="CPU", inputs=inputs, outputs=outputs ) -print('Uploaded model') - +client.set_model_from_file( + "hw_model_n", n_model_path, "TF", device="CPU", inputs=inputs, outputs=outputs +) +print('Uploaded models') diff --git a/files/6-simulation/submit-hw.sh b/files/6-simulation/submit-hw.sh index f96dfe6..378b9f2 100644 --- a/files/6-simulation/submit-hw.sh +++ b/files/6-simulation/submit-hw.sh @@ -2,22 +2,19 @@ #SBATCH --job-name=boutsmartsim #SBATCH --time=0:20:00 -#SBATCH --exclusive #SBATCH --nodes=1 -#SBATCH --tasks-per-node=36 +#SBATCH --tasks-per-node=4 #SBATCH --cpus-per-task=1 -#SBATCH --account=tc057 +#SBATCH --account=x01 #SBATCH --partition=standard #SBATCH --qos=standard -source /work/tc057/tc057/$USER/.bashrc - -# Setup the job environment (this module needs to be loaded before any other modules) -module load mpt -module load intel-compilers-19 -module load fftw/3.3.10-intel19-mpt225 -module load netcdf-parallel/4.6.2-intel19-mpt225 +# Setup the job environment +module load intel-20.4/mpi +module load intel-20.4/compilers +module load fftw/3.3.10-intel20.4-impi20.4 +module load netcdf-parallel/4.9.2-intel20-impi20 # Set the number of threads to 1 # This prevents any threaded system libraries from automatically @@ -27,18 +24,20 @@ export OMP_NUM_THREADS=1 # activate conda environment for SmartSim and SmartRedis Python packages conda activate boutsmartsim -# run folder -RUN_FOLDER=~/run -cd $RUN_FOLDER -mkdir $RUN_FOLDER/data -cp 
/work/tc057/tc057/shared/simulation/run_SmartSim/BOUT.inp $RUN_FOLDER/data -cp /work/tc057/tc057/shared/simulation/run_SmartSim/start_db.py $RUN_FOLDER +cd /path/to/run/ # Start the orchestrator and a new experiment which launches RedisAI for communication -# Load the model from the given file -# Remember to adjust the path to the start_db.py script! -python start_db.py 6899 /work/tc057/tc057/$USER/zero-model-260-256.pb +# Load the vorticity and density models from their files +model_vort=/scratch/space1/d175/amy/full_stack/data-model/cnn/model-hw-20240427-164026-vort.pb +model_n=/scratch/space1/d175/amy/full_stack/data-model/cnn/model-hw-20240427-210530-dens.pb +python start_db.py 6899 $model_vort $model_n +echo "Started Redis" export SSDB=127.0.0.1:6899 +executable=/path/to/build/hasegawa-wakatani -srun -n 1 --distribution=block:block --hint=nomultithread ~/my-bout-smartsim-hw/build/hasegawa-wakatani +# Run the simulation +srun -n 1 --distribution=block:block --hint=nomultithread $executable \ + restart=true append=false \ + solver:type=rk4 solver:adaptive=false solver:timestep=0.026 \ + nout=10 timestep=0.026 mesh:nx=260 mesh:nz=256 mesh:dx=0.1 mesh:dz=0.1
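As a final check before submitting a long job, the uploaded models can be probed directly from Python while the database is up. A minimal sketch, assuming `SSDB` is set as in the script above and that the vorticity and density models were trained on 256x256 single-channel inputs; the probe shape and the `probe_input`/`probe_output` tensor keys are illustrative and should be adjusted to your grid:

```
import os
import numpy as np
from smartredis import Client

# connect to the RedisAI database started by the job script (SSDB=host:port)
client = Client(address=os.environ["SSDB"], cluster=False)

# dummy input; the shape assumes the 256x256 single-channel training setup
probe = np.random.rand(1, 256, 256, 1).astype(np.float64)
client.put_tensor("probe_input", probe)

# run both uploaded models and report output shape and magnitude
for key in ("hw_model_vort", "hw_model_n"):
    client.run_model(key, "probe_input", "probe_output")
    out = client.get_tensor("probe_output")
    print(key, out.shape, float(np.abs(out).max()))
```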