Update the training script for the cnn ms example
solopku authored Sep 17, 2024
1 parent d5ddd10 commit ddad8e3
Showing 1 changed file with 279 additions and 1 deletion.
280 changes: 279 additions & 1 deletion examples/cnn_ms/train_ms_model.py
@@ -55,6 +55,7 @@ def call_with_returns(self, loss):
pn_p_g_list.append([p.name, p, g])  # collect [name, param, grad] triples for the caller to iterate over
return pn_p_g_list

# MSSGD -- identical in behavior to the standard SGD optimizer; no code changes are needed here
class MSSGD(MSOptimizer):
"""Implements stochastic gradient descent (optionally with momentum).
Nesterov momentum is based on the formula from `On the importance of initialization and momentum in deep learning`__.
@@ -205,6 +206,283 @@ def set_states(self, states):
self.mom_value = self.momentum(self.step_counter)


# Data augmentation
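# Pad each image by 4 pixels on every side, take a random crop back at the
# original resolution, and apply a random horizontal flip; the augmentation
# is performed in place, one image at a time.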
def augmentation(x, batch_size):
xpad = np.pad(x, [[0, 0], [0, 0], [4, 4], [4, 4]], 'symmetric')
for data_num in range(0, batch_size):
offset = np.random.randint(8, size=2)
x[data_num, :, :, :] = xpad[data_num, :,
offset[0]:offset[0] + x.shape[2],
offset[1]:offset[1] + x.shape[3]]
if_flip = np.random.randint(2)
if if_flip:
x[data_num, :, :, :] = x[data_num, :, :, ::-1]
return x


# Calculate accuracy
def accuracy(pred, target):
# y holds the predicted class indices, compared with the integer ground truth
y = np.argmax(pred, axis=1)
a = y == target
correct = np.array(a, "int").sum()
return correct


# Data partition according to the rank
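# Each rank receives a contiguous, equally sized shard of the training and
# evaluation data; samples that do not divide evenly across ranks are dropped.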
def partition(global_rank, world_size, train_x, train_y, val_x, val_y):
# Partition training data
data_per_rank = train_x.shape[0] // world_size
idx_start = global_rank * data_per_rank
idx_end = (global_rank + 1) * data_per_rank
train_x = train_x[idx_start:idx_end]
train_y = train_y[idx_start:idx_end]

# Partition evaluation data
data_per_rank = val_x.shape[0] // world_size
idx_start = global_rank * data_per_rank
idx_end = (global_rank + 1) * data_per_rank
val_x = val_x[idx_start:idx_end]
val_y = val_y[idx_start:idx_end]
return train_x, train_y, val_x, val_y


# Function to all reduce NUMPY accuracy and loss from multiple devices
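# The numpy value is copied into the singa reducer tensor, all-reduced across
# ranks by the distributed optimizer, and converted back to numpy.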
def reduce_variable(variable, dist_opt, reducer):
reducer.copy_from_numpy(variable)
dist_opt.all_reduce(reducer.data)
dist_opt.wait()
output = tensor.to_numpy(reducer)
return output


def resize_dataset(x, image_size):
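# Resize every image, channel by channel, to the model's expected input size
# using bilinear interpolation.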
num_data = x.shape[0]
dim = x.shape[1]
X = np.zeros(shape=(num_data, dim, image_size, image_size),
dtype=np.float32)
for n in range(0, num_data):
for d in range(0, dim):
X[n, d, :, :] = np.array(Image.fromarray(x[n, d, :, :]).resize(
(image_size, image_size), Image.BILINEAR),
dtype=np.float32)
return X


def run(global_rank,
world_size,
local_rank,
max_epoch,
batch_size,
model,
data,
mssgd,
graph,
verbosity,
dist_option='plain',
spars=None,
precision='float32'):
# dev = device.create_cuda_gpu_on(local_rank)  # uncomment for GPU training; the default (CPU) device below works on CPU-only machines
dev = device.get_default_device()
dev.SetRandSeed(0)
np.random.seed(0)

if data == 'cifar10':
from data import cifar10
train_x, train_y, val_x, val_y = cifar10.load()
elif data == 'cifar100':
from data import cifar100
train_x, train_y, val_x, val_y = cifar100.load()
elif data == 'mnist':
from data import mnist
train_x, train_y, val_x, val_y = mnist.load()


num_channels = train_x.shape[1]
image_size = train_x.shape[2]
data_size = np.prod(train_x.shape[1:train_x.ndim]).item()
num_classes = (np.max(train_y) + 1).item()

if model == 'resnet':
from model import resnet
model = resnet.resnet50(num_channels=num_channels,
num_classes=num_classes)
elif model == 'xceptionnet':
from model import xceptionnet
model = xceptionnet.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'cnn':
from model import cnn
model = cnn.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'alexnet':
from model import alexnet
model = alexnet.create_model(num_channels=num_channels,
num_classes=num_classes)
elif model == 'mlp':
import os, sys, inspect
current = os.path.dirname(
os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)
from mlp import model
model = model.create_model(data_size=data_size,
num_classes=num_classes)

elif model == 'msmlp':
import os, sys, inspect
current = os.path.dirname(
os.path.abspath(inspect.getfile(inspect.currentframe())))
parent = os.path.dirname(current)
sys.path.insert(0, parent)
from msmlp import model
model = model.create_model(data_size=data_size,
num_classes=num_classes)

# For distributed training, sequential has better performance
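# A communicator attribute on the optimizer indicates a distributed optimizer,
# so the data is partitioned across ranks and the graph runs sequentially.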
if hasattr(mssgd, "communicator"):
DIST = True
sequential = True
else:
DIST = False
sequential = False

if DIST:
train_x, train_y, val_x, val_y = partition(global_rank, world_size,
train_x, train_y, val_x,
val_y)

if model.dimension == 4:
tx = tensor.Tensor(
(batch_size, num_channels, model.input_size, model.input_size), dev,
singa_dtype[precision])
elif model.dimension == 2:
tx = tensor.Tensor((batch_size, data_size), dev, singa_dtype[precision])
train_x = np.reshape(train_x, (train_x.shape[0], -1))
val_x = np.reshape(val_x, (val_x.shape[0], -1))

ty = tensor.Tensor((batch_size,), dev, tensor.int32)
num_train_batch = train_x.shape[0] // batch_size
num_val_batch = val_x.shape[0] // batch_size
idx = np.arange(train_x.shape[0], dtype=np.int32)

# Attach model to graph
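# compile initializes the model against the input placeholder; when use_graph
# is enabled the buffered computational graph is reused across iterations.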
model.set_optimizer(mssgd)
model.compile([tx], is_train=True, use_graph=graph, sequential=sequential)
dev.SetVerbosity(verbosity)

# Training and evaluation loop
for epoch in range(max_epoch):
start_time = time.time()
np.random.shuffle(idx)

if global_rank == 0:
print('Starting Epoch %d:' % (epoch))

# Training phase
train_correct = np.zeros(shape=[1], dtype=np.float32)
test_correct = np.zeros(shape=[1], dtype=np.float32)
train_loss = np.zeros(shape=[1], dtype=np.float32)

model.train()
print ("num_train_batch: \n", num_train_batch)
print ()
for b in range(num_train_batch):
if b % 200 == 0:
print ("b: \n", b)
# Generate the patch data in this iteration
x = train_x[idx[b * batch_size:(b + 1) * batch_size]]
if model.dimension == 4:
x = augmentation(x, batch_size)
if (image_size != model.input_size):
x = resize_dataset(x, model.input_size)
x = x.astype(np_dtype[precision])
y = train_y[idx[b * batch_size:(b + 1) * batch_size]]


synflow_flag = False
# Train the model
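# Branching below: on the last batch of the final epoch an all-ones input is
# fed through the network and |weight * grad| is summed over the weight
# parameters to obtain the SynFlow score; on the second-to-last batch a normal
# step is taken and all parameters are then replaced by their absolute values
# to prepare for that score; every other batch performs a regular training step.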
if epoch == (max_epoch - 1) and b == (num_train_batch - 1): ### synflow calculation for the last batch
print ("last epoch calculate synflow")
synflow_flag = True
### step 1: all one input
# Copy the patch data into input tensors
tx.copy_from_numpy(np.ones(x.shape, dtype=np.float32))
ty.copy_from_numpy(y)
### step 2: all weights turned to positive (done)
### step 3: new loss (done)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
### step 4: calculate the multiplication of weights
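### i.e. synflow_score = sum over 2-D (weight) parameters of |param * grad|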
synflow_score = 0.0
for pn_p_g_item in pn_p_g_list:
print ("calculate weight param * grad parameter name: \n", pn_p_g_item[0])
if len(pn_p_g_item[1].shape) == 2: # param_value.data is "weight"
print ("pn_p_g_item[1].shape: \n", pn_p_g_item[1].shape)
synflow_score += np.sum(np.absolute(tensor.to_numpy(pn_p_g_item[1]) * tensor.to_numpy(pn_p_g_item[2])))
print ("synflow_score: \n", synflow_score)
elif epoch == (max_epoch - 1) and b == (num_train_batch - 2): # all weights turned to positive
# Copy the patch data into input tensors
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
train_correct += accuracy(tensor.to_numpy(out), y)
train_loss += tensor.to_numpy(loss)[0]
# all params turned to positive
for pn_p_g_item in pn_p_g_list:
print ("absolute value parameter name: \n", pn_p_g_item[0])
pn_p_g_item[1] = tensor.abs(pn_p_g_item[1]) # pn_p_g_item[1] is a singa tensor, so use tensor.abs
else: # normal train steps
# Copy the patch data into input tensors
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
# print ("normal before model(tx, ty, synflow_flag, dist_option, spars)")
# print ("train_cnn tx: \n", tx)
# print ("train_cnn ty: \n", ty)
pn_p_g_list, out, loss = model(tx, ty, dist_option, spars, synflow_flag)
# print ("normal after model(tx, ty, synflow_flag, dist_option, spars)")
train_correct += accuracy(tensor.to_numpy(out), y)
train_loss += tensor.to_numpy(loss)[0]

if DIST:
# Reduce the evaluation accuracy and loss from multiple devices
reducer = tensor.Tensor((1,), dev, tensor.float32)
train_correct = reduce_variable(train_correct, mssgd, reducer)
train_loss = reduce_variable(train_loss, mssgd, reducer)

if global_rank == 0:
print('Training loss = %f, training accuracy = %f' %
(train_loss, train_correct /
(num_train_batch * batch_size * world_size)),
flush=True)

# Evaluation phase
model.eval()
for b in range(num_val_batch):
x = val_x[b * batch_size:(b + 1) * batch_size]
if model.dimension == 4:
if (image_size != model.input_size):
x = resize_dataset(x, model.input_size)
x = x.astype(np_dtype[precision])
y = val_y[b * batch_size:(b + 1) * batch_size]
tx.copy_from_numpy(x)
ty.copy_from_numpy(y)
out_test = model(tx)
test_correct += accuracy(tensor.to_numpy(out_test), y)

if DIST:
# Reduce the evaluation accuracy from multiple devices
test_correct = reduce_variable(test_correct, mssgd, reducer)

# Output the evaluation accuracy
if global_rank == 0:
print('Evaluation accuracy = %f, Elapsed Time = %fs' %
(test_correct / (num_val_batch * batch_size * world_size),
time.time() - start_time),
flush=True)

dev.PrintTimeProfiling()


if __name__ == '__main__':
# Use argparse to get command config: max_epoch, model, data, etc., for single-GPU training
parser = argparse.ArgumentParser(
@@ -271,4 +549,4 @@ def set_states(self, states):
mssgd,
args.graph,
args.verbosity,
precision=args.precision)
precision=args.precision)
