import os, time
from typing import Any
from functools import partial
from pathlib import Path
from flax import nnx
from tqdm import tqdm
import optax
import jax
import jax.numpy as jnp
import orbax.checkpoint as ocp
# from flax.training import train_state, orbax_utils
import tensorboardX as tbx


def banner_message(message):
    if isinstance(message, str):
        message = [message]
    elif not isinstance(message, list) or not all(isinstance(m, str) for m in message):
        raise ValueError("message should be a string or a list of strings.")
    msg_len = max(46, max(len(msg) for msg in message))
    # Top border
    print("\33[1;32m╔═{:═<{width}}═╗".format("", width=msg_len))
    # Message lines
    for msg in message:
        print("║ {:^{width}} ║".format(msg, width=msg_len))
    # Bottom border
    print("╚═{:═<{width}}═╝\33[0m".format("", width=msg_len))


def lr_schedule(lr, steps_per_epoch, epochs=100, warmup=5):
    return optax.warmup_cosine_decay_schedule(
        init_value=lr / 10,
        peak_value=lr,
        end_value=lr / 100,
        warmup_steps=steps_per_epoch * warmup,
        # optax counts the warmup phase inside decay_steps, so the total
        # schedule length is the whole run, not epochs - warmup.
        decay_steps=steps_per_epoch * epochs,
    )
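
# Illustrative sketch (the values below are assumptions, not from the original
# script): querying the schedule at a few steps shows the warmup/decay shape.
#   sched = lr_schedule(lr=1e-3, steps_per_epoch=100, epochs=10, warmup=2)
#   sched(0)     # ~1e-4, start of warmup
#   sched(200)   # ~1e-3, peak after 2 warmup epochs
#   sched(999)   # ~1e-5, tail of the cosine decay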


@nnx.jit
def eval_step(model, batch):
    x, y = batch
    logits = model(x)
    acc = jnp.equal(jnp.argmax(logits, -1), y).mean()
    return acc


@nnx.jit
def loss_fn(logits, labels, epoch=None):
    loss = optax.softmax_cross_entropy(
        logits=logits,
        labels=jax.nn.one_hot(labels, 10),
    ).mean()
    return loss, {'loss': loss}
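
# Any loss with the same (logits, labels, epoch) -> (loss, aux_dict) signature
# can be passed to fit() below. As an illustrative sketch only (not part of the
# original setup): a label-smoothing variant built from optax.smooth_labels.
def smooth_loss_fn(logits, labels, epoch=None, alpha=0.1):
    loss = optax.softmax_cross_entropy(
        logits=logits,
        labels=optax.smooth_labels(jax.nn.one_hot(labels, 10), alpha),
    ).mean()
    return loss, {'loss': loss}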


# loss_fn is a static argument (argnum 3): jit traces train_step once per
# distinct loss function object passed in.
@partial(nnx.jit, static_argnums=(3,))
def train_step(model, optimizer: nnx.Optimizer, batch, loss_fn, epoch):
    x, y = batch

    def compute_loss(model):
        logits = model(x)
        return loss_fn(logits, y, epoch)

    grad_fn = nnx.value_and_grad(compute_loss, has_aux=True)
    (loss, loss_dict), grads = grad_fn(model)
    optimizer.update(grads)
    return loss_dict
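
# Because loss_fn is static, reuse one function object across steps; passing a
# fresh lambda on every call would re-trigger compilation. Sketch:
#   train_step(model, optimizer, batch, loss_fn, epoch)                 # cached
#   train_step(model, optimizer, batch, lambda *a: loss_fn(*a), epoch)  # re-traces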


def load_ckpt(model, ckpt_dir):
    if ckpt_dir is None or not os.path.exists(ckpt_dir):
        banner_message(["No checkpoint was loaded", "Training from scratch"])
        return model
    checkpointer = ocp.StandardCheckpointer()
    # the model's current state serves as the restore target (shapes/dtypes)
    graphdef, abstract_state = nnx.split(model)
    state_restored = checkpointer.restore(os.path.abspath(ckpt_dir), abstract_state)
    model = nnx.merge(graphdef, state_restored)
    return model
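
# Usage sketch (path layout assumed from fit() below, which saves one
# checkpoint per epoch under checkpoints/<epoch>):
#   model = load_ckpt(model, "./checkpoints/10")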


def fit(model,
        train_ds, test_ds,
        optimizer: nnx.Optimizer,
        loss_fn=loss_fn,
        num_epochs=100,
        eval_step=None,
        eval_freq=1,
        log_name='default',
        hparams=None,
        ):
    # logging
    banner_message(["Start training", "Device > {}".format(", ".join([str(i) for i in jax.devices()]))])
    Path("checkpoints").mkdir(exist_ok=True, parents=True)
    ckpt_path = os.path.abspath("checkpoints")
    checkpointer = ocp.StandardCheckpointer()
    writer = tbx.SummaryWriter("logs/{}_{}".format(log_name, time.strftime("%m%d%H%M%S")))
    best_acc = .0
    # start training
    for epoch in range(1, num_epochs + 1):
        model.train()
        pbar = tqdm(train_ds)
        for batch in pbar:
            ## if batch is not from tfds.as_numpy, convert it to numpy
            ## (jax.tree_map is deprecated; jax.tree_util.tree_map is the current name)
            batch = jax.tree_util.tree_map(lambda x: x._numpy(), batch)
            loss_dict = train_step(model, optimizer, batch, loss_fn, epoch)
            # NOTE: display-only placeholder; the actual lr follows the optax
            # schedule attached to the optimizer.
            lr = 0.1
            pbar.set_description(f'Epoch {epoch:3d}, lr: {lr:.7f}, loss: {loss_dict["loss"]:.4f}')
            steps = optimizer.step.value
            if steps % 10 == 0 or steps == 1:
                # writer.add_scalar('train/learning_rate', lr, steps)
                for k, v in loss_dict.items():
                    writer.add_scalar(f'train/{k}', v, steps)
                writer.flush()
        if eval_step is None:
            _, state = nnx.split(model)
            checkpointer.save(os.path.join(ckpt_path, str(epoch)), state, force=True)
        elif epoch % eval_freq == 0:
            acc = []
            model.eval()
            for batch in test_ds:
                ## if batch is not from tfds.as_numpy, convert it to numpy
                batch = jax.tree_util.tree_map(lambda x: x._numpy(), batch)
                a = eval_step(model, batch)
                acc.append(a)
            acc = 0 if len(acc) == 0 else jnp.stack(acc).mean()
            pbar.write(f'Epoch {epoch:3d}, test acc: {acc:.6f}')
            writer.add_scalar('test/accuracy', acc, epoch)
            if acc > best_acc:
                _, state = nnx.split(model)
                checkpointer.save(os.path.join(ckpt_path, str(epoch)), state, force=True)
                best_acc = acc
    banner_message(["Training finished", f"Best test acc: {best_acc:.6f}"])
    if hparams is not None:
        writer.add_hparams(hparams, {'metric/accuracy': best_acc}, name='hparam')
    writer.close()
    banner_message(["Device > {}".format(", ".join([str(i) for i in jax.devices()]))])


if __name__ == "__main__":
    from model import Model
    import tensorflow_datasets as tfds

    def get_train_batches(batch_size=256):
        ds = tfds.load(name='mnist', split='train', as_supervised=True, shuffle_files=True)
        ds = ds.batch(batch_size).prefetch(1)
        # return tfds.as_numpy(ds)
        # NOTE: tfds.as_numpy(ds) was not working here for some reason, so the raw
        # tf.data pipeline is returned and batches are converted to numpy in fit().
        return ds

    def get_test_batches(batch_size=256):
        ds = tfds.load(name='mnist', split='test', as_supervised=True, shuffle_files=False)
        ds = ds.batch(batch_size).prefetch(1)
        # return tfds.as_numpy(ds)
        return ds
    config = {
        'lr': 5e-3,
        'batch_size': 128,
        'num_epochs': 10,
        'warmup': 3,
    }
    train_ds = get_train_batches(batch_size=config['batch_size'])
    test_ds = get_test_batches(batch_size=config['batch_size'])
    lr_fn = lr_schedule(lr=config['lr'], steps_per_epoch=len(train_ds),
                        epochs=config['num_epochs'], warmup=config['warmup'])
    key = nnx.Rngs(0)
    model = Model(key)
    x = jnp.ones((1, 28, 28, 1))  # dummy MNIST-shaped input (unused; kept as a shape reference)
    optimizer = nnx.Optimizer(model, optax.nadam(lr_fn))
    fit(model, train_ds, test_ds,
        optimizer=optimizer,
        loss_fn=loss_fn,
        eval_step=eval_step,
        eval_freq=1,
        num_epochs=config['num_epochs'],
        hparams=config,
        log_name='mnist')
    # fit() saves one checkpoint per epoch under checkpoints/<epoch>, so point
    # load_ckpt at a specific epoch directory; here, the latest one.
    latest_ckpt = max((p for p in Path("checkpoints").iterdir() if p.is_dir()),
                      key=lambda p: int(p.name), default=None)
    model = load_ckpt(model, latest_ckpt)
    acc = []
    model.eval()
    for batch in test_ds:
        batch = jax.tree_util.tree_map(lambda x: x._numpy(), batch)
        a = eval_step(model, batch)
        acc.append(a)
    acc = jnp.stack(acc).mean()
    print("Eval Accuracy: {:.6f}".format(acc))