diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 20c93a4101..092033e637 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -3,6 +3,8 @@ import os import time import shutil +import copy +import gc import numpy as np from deepmd.env import tf, paddle from deepmd.env import default_tf_session_config @@ -79,6 +81,7 @@ class DPTrainer (object): def __init__(self, jdata, run_opt): + paddle.set_device("cpu") self.run_opt = run_opt self._init_param(jdata) @@ -387,6 +390,9 @@ def train (self, % (self.cur_batch, train_time, test_time)) train_time = 0 + if self.save_freq > 0 and self.cur_batch % self.save_freq == 0: + self.save_model(model_inputs, self.save_ckpt + "/model") + if self.run_opt.is_chief: fp.close () if self.profiling and self.run_opt.is_chief : @@ -400,6 +406,7 @@ def train (self, def save_model(self, model_inputs_, folder_name_): # Since "paddle.jit.to_static" modifiess the model in-place # We have to make a temporary model copy to avoid damage to the original model. + model = copy.copy(self.model) save_path = os.getcwd() + "/" + folder_name_ if self.fitting_type == "ener" and self.descrpt_type == "se_a": input_names = ['coord', 'type', 'natoms_vec', 'box', 'default_mesh'] @@ -407,8 +414,14 @@ def save_model(self, model_inputs_, folder_name_): else: raise NotImplementedError - model = paddle.jit.to_static(self.model, input_spec=input_specs) - paddle.jit.save(model, save_path) + try: + model = paddle.jit.to_static(model, input_spec=input_specs) + paddle.jit.save(model, save_path) + except Exception as e: + raise e + finally: + del model + gc.collect() log.info("saved checkpoint to %s" % (save_path)) diff --git a/examples/water/train/water_se_a.json b/examples/water/train/water_se_a.json index 750c55c6f3..368170a77f 100644 --- a/examples/water/train/water_se_a.json +++ b/examples/water/train/water_se_a.json @@ -54,6 +54,7 @@ "disp_file": "lcurve.out", "disp_freq": 100, "numb_test": 10, + "save_freq": 1000, "save_ckpt": "model.ckpt", "load_ckpt": "model.ckpt", "disp_training":true, diff --git a/source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc b/source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc index 432586cac8..13592b17bd 100644 --- a/source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc +++ b/source/op/paddle_ops/srcs/pd_prod_env_mat_multi_devices_cpu.cc @@ -72,10 +72,6 @@ _prepare_coord_nlist_cpu( const int &max_cpy_trial, const int &max_nnei_trial); -// Numerical regression between CUDA 10.1 & CUDA 11.2 -// Disable CUDA support until latest changes on -// /source/lib/src/cuda/xxx.cu get merged -/* #ifdef PADDLE_WITH_CUDA std::vector PdProdEnvMatAOpCUDAForward( const paddle::Tensor &coord_tensor, @@ -91,7 +87,6 @@ std::vector PdProdEnvMatAOpCUDAForward( std::vector sel_a, std::vector sel_r); #endif -*/ template void PdProdEnvMatAOpCPUForwardKernel( @@ -149,13 +144,13 @@ std::vector PdProdEnvMatAOpCPUForward( std::vector sel_a, std::vector sel_r) { - CHECK_INPUT_READY(coord_tensor); - CHECK_INPUT_READY(type_tensor); - CHECK_INPUT_READY(natoms_tensor); - CHECK_INPUT_READY(box_tensor); - CHECK_INPUT_READY(mesh_tensor); - CHECK_INPUT_READY(avg_tensor); - CHECK_INPUT_READY(std_tensor); + CHECK_INPUT(coord_tensor); + CHECK_INPUT(type_tensor); + CHECK_INPUT(natoms_tensor); + CHECK_INPUT(box_tensor); + CHECK_INPUT(mesh_tensor); + CHECK_INPUT(avg_tensor); + CHECK_INPUT(std_tensor); std::vector sec_a; std::vector sec_r; @@ -195,15 +190,7 @@ std::vector PdProdEnvMatAOpCPUForward( PD_CHECK(sec_r.back() == 0, "Rotational free descriptor only support all-angular information: sel_r should be all zero."); PD_CHECK(natoms_tensor.shape()[0] >= 3, "Number of atoms should be larger than (or equal to) 3"); // Paddle Set device on Python not in custom op - - // TODO: This code should be removed once cuda issue fixed. - const int* natoms = nullptr; - if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - }else{ - natoms = natoms_tensor.data(); - } - + const int *natoms = natoms_tensor.data(); int nloc = natoms[0]; int nall = natoms[1]; int ntypes = natoms_tensor.shape()[0] - 2; //nloc and nall mean something. @@ -256,41 +243,21 @@ std::vector PdProdEnvMatAOpCPUForward( paddle::Tensor descrpt_deriv_tensor = paddle::Tensor(paddle::PlaceType::kCPU, descrpt_deriv_shape); paddle::Tensor rij_tensor = paddle::Tensor(paddle::PlaceType::kCPU, rij_shape); paddle::Tensor nlist_tensor = paddle::Tensor(paddle::PlaceType::kCPU, nlist_shape); - - if(natoms_tensor.place() == paddle::PlaceType::kCPU) { - PD_DISPATCH_FLOATING_TYPES( - coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] { - PdProdEnvMatAOpCPUForwardKernel( - nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, - mesh_tensor.data(), nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, - descrpt_tensor.mutable_data(), - descrpt_deriv_tensor.mutable_data(), - rij_tensor.mutable_data(), - nlist_tensor.mutable_data(), - coord_tensor.data(), - box_tensor.data(), - avg_tensor.data(), - std_tensor.data(), - type_tensor.data()); - })); - } else { - PD_DISPATCH_FLOATING_TYPES( - coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] { - PdProdEnvMatAOpCPUForwardKernel( - nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, - mesh_tensor.size() == 0 ? mesh_tensor.data() : mesh_tensor.copy_to(paddle::PlaceType::kCPU).data(), - nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, - descrpt_tensor.mutable_data(), - descrpt_deriv_tensor.mutable_data(), - rij_tensor.mutable_data(), - nlist_tensor.mutable_data(), - coord_tensor.copy_to(paddle::PlaceType::kCPU).data(), - box_tensor.copy_to(paddle::PlaceType::kCPU).data(), - avg_tensor.copy_to(paddle::PlaceType::kCPU).data(), - std_tensor.copy_to(paddle::PlaceType::kCPU).data(), - type_tensor.copy_to(paddle::PlaceType::kCPU).data()); - })); - } + PD_DISPATCH_FLOATING_TYPES( + coord_tensor.type(), "pd_prod_env_mat_a_cpu_forward_kernel", ([&] { + PdProdEnvMatAOpCPUForwardKernel( + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, + mesh_tensor.data(), nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, + descrpt_tensor.mutable_data(), + descrpt_deriv_tensor.mutable_data(), + rij_tensor.mutable_data(), + nlist_tensor.mutable_data(), + coord_tensor.data(), + box_tensor.data(), + avg_tensor.data(), + std_tensor.data(), + type_tensor.data()); + })); return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; } @@ -315,23 +282,6 @@ std::vector PdProdEnvMatAOpForward( CHECK_INPUT_READY(mesh_tensor); CHECK_INPUT_READY(avg_tensor); CHECK_INPUT_READY(std_tensor); - - // Force dispatch to CPU until CUDA bug fixed - return PdProdEnvMatAOpCPUForward( - coord_tensor, - type_tensor, - natoms_tensor, - box_tensor, - mesh_tensor, - avg_tensor, - std_tensor, - rcut_a, - rcut_r, - rcut_r_smth, - sel_a, - sel_r - ); - /* if (coord_tensor.place() == paddle::PlaceType::kCPU) { return PdProdEnvMatAOpCPUForward( coord_tensor, @@ -367,7 +317,6 @@ std::vector PdProdEnvMatAOpForward( } else { PD_THROW("Not implemented."); } - */ } template static void diff --git a/source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc b/source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc index aef695f3da..a0ca9218e4 100644 --- a/source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc +++ b/source/op/paddle_ops/srcs/pd_prod_force_se_a_multi_devices_cpu.cc @@ -10,10 +10,6 @@ -// Numerical regression between CUDA 10.1 & CUDA 11.2 -// Disable CUDA support until latest changes on -// /source/lib/src/cuda/xxx.cu get merged -/* #ifdef PADDLE_WITH_CUDA std::vector PdProdForceSeAOpCUDAForward( const paddle::Tensor& net_deriv_tensor, @@ -23,7 +19,6 @@ const paddle::Tensor& natoms_tensor, int n_a_sel, int n_r_sel); #endif -*/ template void PdProdForceSeAOpForwardCPUKernel( @@ -49,10 +44,10 @@ const paddle::Tensor& natoms_tensor, int n_a_sel, int n_r_sel ){ - CHECK_INPUT_READY(net_deriv_tensor); - CHECK_INPUT_READY(in_deriv_tensor); - CHECK_INPUT_READY(nlist_tensor); - CHECK_INPUT_READY(natoms_tensor); + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(nlist_tensor); + CHECK_INPUT(natoms_tensor); CHECK_INPUT_DIM(net_deriv_tensor, 2); CHECK_INPUT_DIM(in_deriv_tensor, 2); @@ -60,13 +55,7 @@ int n_r_sel CHECK_INPUT_DIM(natoms_tensor, 1); PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - // TODO: This code should be removed once cuda issue fixed. - const int* natoms = nullptr; - if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - }else{ - natoms = natoms_tensor.data(); - } + const int* natoms = natoms_tensor.data(); int nloc = natoms[0]; int nall = natoms[1]; int nframes = net_deriv_tensor.shape()[0]; @@ -90,24 +79,13 @@ int n_r_sel assert (nloc * nnei == nlist_tensor.shape()[1]); assert (nnei * 4 == ndescrpt); - if(natoms_tensor.place() == paddle::PlaceType::kCPU){ - PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { - PdProdForceSeAOpForwardCPUKernel( - nloc, nall, nframes, ndescrpt, nnei, - force_tensor.mutable_data(), net_deriv_tensor.data(), - in_deriv_tensor.data(), nlist_tensor.data()); - })); - } else { - PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { - PdProdForceSeAOpForwardCPUKernel( - nloc, nall, nframes, ndescrpt, nnei, - force_tensor.mutable_data(), net_deriv_tensor.copy_to(paddle::PlaceType::kCPU).data(), - in_deriv_tensor.copy_to(paddle::PlaceType::kCPU).data(), nlist_tensor.copy_to(paddle::PlaceType::kCPU).data()); - })); - - } + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { + PdProdForceSeAOpForwardCPUKernel( + nloc, nall, nframes, ndescrpt, nnei, + force_tensor.mutable_data(), net_deriv_tensor.data(), + in_deriv_tensor.data(), nlist_tensor.data()); + })); return {force_tensor}; } @@ -221,9 +199,6 @@ const paddle::Tensor& nlist_tensor, const paddle::Tensor& natoms_tensor, int n_a_sel, int n_r_sel){ - // Force dispatch to CPU until CUDA bug fixed - return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); - /* if(net_deriv_tensor.place() == paddle::PlaceType::kCPU){ return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); #ifdef PADDLE_WITH_CUDA @@ -233,7 +208,6 @@ int n_r_sel){ }else{ PD_THROW("No Such kernel for PdFrodForceSeAForward!"); } - */ } std::vector PdProdForceSeABackward( diff --git a/source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cpu.cc b/source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cpu.cc index c652fa0f57..43b8740d0e 100644 --- a/source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cpu.cc +++ b/source/op/paddle_ops/srcs/pd_prod_virial_se_a_multi_devices_cpu.cc @@ -9,10 +9,6 @@ #define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -// Numerical regression between CUDA 10.1 & CUDA 11.2 -// Disable CUDA support until latest changes on -// /source/lib/src/cuda/xxx.cu get merged -/* #ifdef PADDLE_WITH_CUDA std::vector PdProdVirialSeAOpCUDAForward( const paddle::Tensor& net_deriv_tensor, @@ -23,7 +19,6 @@ const paddle::Tensor& natoms_tensor, int n_a_sel, int n_r_sel); #endif -*/ template void PdProdVirialSeAOpForwardCPUKernel( diff --git a/source/tests/test_pd_prod_force_and_virial.py b/source/tests/test_pd_prod_force_and_virial.py index 4b1c57db9c..a71e2d44c0 100644 --- a/source/tests/test_pd_prod_force_and_virial.py +++ b/source/tests/test_pd_prod_force_and_virial.py @@ -18,10 +18,11 @@ from tensorflow.python.framework import ops from common import Data + if GLOBAL_NP_FLOAT_PRECISION == np.float32 : global_default_fv_hh = 1e-2 global_default_dw_hh = 1e-2 - global_default_places = 2 + global_default_places = 3 else : global_default_fv_hh = 1e-5 global_default_dw_hh = 1e-4