-
Notifications
You must be signed in to change notification settings - Fork 143
/
example.yaml
159 lines (137 loc) · 12.2 KB
/
example.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# a simple example config file
# Two folders will be used during the training: 'root'/process and 'root'/'run_name'
# run_name contains logfiles and saved models
# process contains processed data sets
# if 'root'/'run_name' exists, 'root'/'run_name'_'year'-'month'-'day'-'hour'-'min'-'s' will be used instead.
root: results/toluene
run_name: example-run-toluene
seed: 123 # model seed
dataset_seed: 456 # data set seed
append: true # set true if a restarted run should append to the previous log file
# see https://arxiv.org/abs/2304.10061 for discussion of numerical precision
default_dtype: float64
model_dtype: float32
allow_tf32: true # consider setting to false if you plan to mix training/inference over any devices that are not NVIDIA Ampere or later
# network
r_max: 4.0 # cutoff radius in length units, here Angstrom, this is an important hyperparamter to scan
num_layers: 4 # number of interaction blocks, we find 3-5 to work best
l_max: 2 # the maximum irrep order (rotation order) for the network's features, l=2 is accurate but slower, l=1 if you want to be faster but less accurte
parity: true # whether to include features with odd mirror parity; often turning parity off gives equally good results but faster networks, so do consider this
num_features: 32 # the multiplicity of the features, 32 is a good default for accurate network, if you want to be more accurate, go larger, if you want to be faster, go lower
nonlinearity_type: gate # may be 'gate' or 'norm', 'gate' is recommended
# scalar nonlinearities to use — available options are silu, ssp (shifted softplus), tanh, and abs.
# Different nonlinearities are specified for e (even) and o (odd) parity;
# note that only tanh and abs are correct for o (odd parity)
# silu typically works best for even
nonlinearity_scalars:
e: silu
o: tanh
nonlinearity_gates:
e: silu
o: tanh
# radial network basis
num_basis: 8 # number of basis functions used in the radial basis, 8 usually works best
BesselBasis_trainable: true # set true to train the bessel weights
PolynomialCutoff_p: 6 # p-exponent used in polynomial cutoff function, smaller p corresponds to stronger decay with distance
# radial network
invariant_layers: 2 # number of radial layers, usually 1-3 works best, smaller is faster
invariant_neurons: 64 # number of hidden neurons in radial function, smaller is faster
avg_num_neighbors: auto # number of neighbors to divide by, null => no normalization, auto computes it based on dataset
use_sc: true # use self-connection or not, usually gives big improvement
# data set
# the keys used need to be stated at least once in key_mapping, npz_fixed_field_keys or npz_keys
# key_mapping is used to map the key in the npz file to the NequIP default values (see data/_key.py)
# all arrays are expected to have the shape of (nframe, natom, ?) except the fixed fields
# note that if your data set uses pbc, you need to also pass an array that maps to the nequip "pbc" key
dataset: npz # type of data set, can be npz or ase
dataset_url: http://quantum-machine.org/gdml/data/npz/toluene_ccsd_t.zip # url to download the npz. optional
dataset_file_name: ./benchmark_data/toluene_ccsd_t-train.npz # path to data set file
key_mapping:
z: atomic_numbers # atomic species, integers
E: total_energy # total potential eneriges to train to
F: forces # atomic forces to train to
R: pos # raw atomic positions
npz_fixed_field_keys: # fields that are repeated across different examples
- atomic_numbers
# A list of atomic types to be found in the data. The NequIP types will be named with the chemical symbols, and inputs with the correct atomic numbers will be mapped to the corresponding types.
chemical_symbols:
- H
- C
# logging
wandb: true # we recommend using wandb for logging
wandb_project: toluene-example # project name used in wandb
verbose: info # the same as python logging, e.g. warning, info, debug, error; case insensitive
log_batch_freq: 100 # batch frequency, how often to print training errors withinin the same epoch
log_epoch_freq: 1 # epoch frequency, how often to print
save_checkpoint_freq: -1 # frequency to save the intermediate checkpoint. no saving of intermediate checkpoints when the value is not positive.
save_ema_checkpoint_freq: -1 # frequency to save the intermediate ema checkpoint. no saving of intermediate checkpoints when the value is not positive.
# training
n_train: 100 # number of training data
n_val: 50 # number of validation data
learning_rate: 0.005 # learning rate, we found values between 0.01 and 0.005 to work best - this is often one of the most important hyperparameters to tune
batch_size: 5 # batch size, we found it important to keep this small for most applications including forces (1-5); for energy-only training, higher batch sizes work better
validation_batch_size: 10 # batch size for evaluating the model during validation. This does not affect the training results, but using the highest value possible (<=n_val) without running out of memory will speed up your training.
max_epochs: 100000 # stop training after _ number of epochs, we set a very large number, as e.g. 1million and then just use early stopping and not train the full number of epochs
train_val_split: random # can be random or sequential. if sequential, first n_train elements are training, next n_val are val, else random, usually random is the right choice
shuffle: true # if true, the data loader will shuffle the data, usually a good idea
metrics_key: validation_loss # metrics used for scheduling and saving best model. Options: `set`_`quantity`, set can be either "train" or "validation, "quantity" can be loss or anything that appears in the validation batch step header, such as f_mae, f_rmse, e_mae, e_rmse
use_ema: true # if true, use exponential moving average on weights for val/test, usually helps a lot with training, in particular for energy errors
ema_decay: 0.99 # ema weight, typically set to 0.99 or 0.999
ema_use_num_updates: true # whether to use number of updates when computing averages
report_init_validation: true # if True, report the validation error for just initialized model
# early stopping based on metrics values.
early_stopping_patiences: # stop early if a metric value stopped decreasing for n epochs
validation_loss: 50
early_stopping_lower_bounds: # stop early if a metric value is lower than the bound
LR: 1.0e-5
early_stopping_upper_bounds: # stop early if the training appears to have exploded
validation_loss: 1.0e+4
# loss function
loss_coeffs:
forces: 1 # if using PerAtomMSELoss, a default weight of 1:1 on each should work well
total_energy:
- 1
- PerAtomMSELoss
# output metrics
metrics_components:
- - forces # key
- mae # "rmse" or "mae"
- - forces
- rmse
- - forces
- mae
- PerSpecies: True # if true, per species contribution is counted separately
report_per_component: False # if true, statistics on each component (i.e. fx, fy, fz) will be counted separately
- - forces
- rmse
- PerSpecies: True
report_per_component: False
- - total_energy
- mae
- - total_energy
- mae
- PerAtom: True # if true, energy is normalized by the number of atoms
# optimizer, may be any optimizer defined in torch.optim
# the name `optimizer_name`is case sensitive
# IMPORTANT: for NequIP (not for Allegro), we find that in most cases AMSGrad strongly improves
# out-of-distribution generalization over Adam. We highly recommed trying both AMSGrad (by setting
# optimizer_amsgrad: true) and Adam (by setting optimizer_amsgrad: false)
optimizer_name: Adam
optimizer_amsgrad: true
# lr scheduler, currently only supports the two options listed in full.yaml, i.e. on-pleteau and cosine annealing with warm restarts, if you need more please file an issue
# here: on-plateau, reduce lr by factory of lr_scheduler_factor if metrics_key hasn't improved for lr_scheduler_patience epoch
lr_scheduler_name: ReduceLROnPlateau
lr_scheduler_patience: 100
lr_scheduler_factor: 0.5
# we provide a series of options to shift and scale the data
# these are for advanced use and usually the defaults work very well
# the default is to scale the atomic energy and forces by scaling them by the force standard deviation and to shift the energy by the mean atomic energy
# in certain cases, it can be useful to have a trainable shift/scale and to also have species-dependent shifts/scales for each atom
# initial atomic energy shift for each species. default to the mean of per atom energy. Optional
# the value can be a constant float value, an array for each species, or a string that defines a statistics over the training dataset
# if numbers are explicitly provided, they must be in the same energy units as the training data
per_species_rescale_shifts: dataset_per_atom_total_energy_mean
# initial atomic energy scale for each species. Optional.
# the value can be a constant float value, an array for each species, or a string
# if numbers are explicitly provided, they must be in the same energy units as the training data
per_species_rescale_scales: null