From adf640a49cad144d707686e629f24756e28968f6 Mon Sep 17 00:00:00 2001 From: Chenxing Luo Date: Sat, 16 Mar 2024 18:53:43 -0400 Subject: [PATCH 1/3] Fix LAMMPS plugin symlink path on macOS platform --- source/lmp/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt index bfc2253412..4fdae7ac5b 100644 --- a/source/lmp/plugin/CMakeLists.txt +++ b/source/lmp/plugin/CMakeLists.txt @@ -126,7 +126,7 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION) install( CODE "execute_process( \ COMMAND ${CMAKE_COMMAND} -E create_symlink \ - ../${CMAKE_SHARED_LIBRARY_PREFIX}${libname}${CMAKE_SHARED_LIBRARY_SUFFIX} \ + ../${CMAKE_SHARED_MODULE_PREFIX}${libname}${CMAKE_SHARED_MODULE_SUFFIX} \ ${CMAKE_INSTALL_PREFIX}/lib/${libname}/${PLUGINNAME} \ )") endif() From 096db6aaed3b19ea3160388b901c363bed9f38e4 Mon Sep 17 00:00:00 2001 From: Chenxing Luo Date: Sun, 7 Apr 2024 02:35:32 -0400 Subject: [PATCH 2/3] Fix error when distributed training and data loading is unavailable --- deepmd/pt/entrypoints/main.py | 2 +- deepmd/pt/optimizer/LKF.py | 2 +- deepmd/pt/train/training.py | 22 +++++++++++----------- deepmd/pt/utils/dataloader.py | 6 +++--- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 77091c3cf7..63463ceeef 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -159,7 +159,7 @@ def prepare_trainer_input_single( stat_file_path_single, ) - rank = dist.get_rank() if dist.is_initialized() else 0 + rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 if not multi_task: ( train_data, diff --git a/deepmd/pt/optimizer/LKF.py b/deepmd/pt/optimizer/LKF.py index 06b341d987..6196414243 100644 --- a/deepmd/pt/optimizer/LKF.py +++ b/deepmd/pt/optimizer/LKF.py @@ -47,7 +47,7 @@ def __init__( # the first param, because this helps with casting in load_state_dict self._state = self.state[self._params[0]] self._state.setdefault("kalman_lambda", kalman_lambda) - self.dist_init = dist.is_initialized() + self.dist_init = dist.is_available() and dist.is_initialized() self.rank = dist.get_rank() if self.dist_init else 0 self.dindex = [] self.remainder = 0 diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 73404b0c83..1cdc383d01 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -122,8 +122,8 @@ def __init__( self.model_keys = ( list(model_params["model_dict"]) if self.multi_task else ["Default"] ) - self.rank = dist.get_rank() if dist.is_initialized() else 0 - self.world_size = dist.get_world_size() if dist.is_initialized() else 1 + self.rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 + self.world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1 self.num_model = len(self.model_keys) # Iteration config @@ -169,7 +169,7 @@ def get_dataloader_and_buffer(_data, _params): _data, sampler=_sampler, batch_size=None, - num_workers=NUM_WORKERS, # setting to 0 diverges the behavior of its iterator; should be >=1 + num_workers=NUM_WORKERS if dist.is_available() else 0, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, pin_memory=True, ) @@ -607,7 +607,7 @@ def single_model_finetune( if shared_links is not None: self.wrapper.share_params(shared_links, resume=resuming or self.rank != 0) - if dist.is_initialized(): + if dist.is_available() and dist.is_initialized(): torch.cuda.set_device(LOCAL_RANK) # DDP will guarantee the model parameters are identical across all processes self.wrapper = DDP( @@ -673,7 +673,7 @@ def run(self): record_file = f"Sample_rank_{self.rank}.txt" fout1 = open(record_file, mode="w", buffering=1) log.info("Start to train %d steps.", self.num_steps) - if dist.is_initialized(): + if dist.is_available() and dist.is_initialized(): log.info(f"Rank: {dist.get_rank()}/{dist.get_world_size()}") if self.enable_tensorboard: from torch.utils.tensorboard import ( @@ -734,7 +734,7 @@ def step(_step_id, task_key="Default"): elif self.opt_type == "LKF": if isinstance(self.loss, EnergyStdLoss): KFOptWrapper = KFOptimizerWrapper( - self.wrapper, self.optimizer, 24, 6, dist.is_initialized() + self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized() ) pref_e = self.opt_param["kf_start_pref_e"] * ( self.opt_param["kf_limit_pref_e"] @@ -753,7 +753,7 @@ def step(_step_id, task_key="Default"): # [coord, atype, natoms, mapping, shift, nlist, box] model_pred = {"energy": p_energy, "force": p_force} module = ( - self.wrapper.module if dist.is_initialized() else self.wrapper + self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper ) def fake_model(): @@ -768,10 +768,10 @@ def fake_model(): ) elif isinstance(self.loss, DenoiseLoss): KFOptWrapper = KFOptimizerWrapper( - self.wrapper, self.optimizer, 24, 6, dist.is_initialized() + self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized() ) module = ( - self.wrapper.module if dist.is_initialized() else self.wrapper + self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper ) model_pred = KFOptWrapper.update_denoise_coord( input_dict, @@ -924,7 +924,7 @@ def log_loss_valid(_task_key="Default"): # Handle the case if rank 0 aborted and re-assigned self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt") - module = self.wrapper.module if dist.is_initialized() else self.wrapper + module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper self.save_model(self.latest_model, lr=cur_lr, step=_step_id) log.info(f"Saved model to {self.latest_model}") symlink_prefix_files(self.latest_model.stem, self.save_ckpt) @@ -990,7 +990,7 @@ def log_loss_valid(_task_key="Default"): prof.stop() def save_model(self, save_path, lr=0.0, step=0): - module = self.wrapper.module if dist.is_initialized() else self.wrapper + module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper module.train_infos["lr"] = lr module.train_infos["step"] = step torch.save( diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 361bc4b0b6..4705c6d0b4 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -97,7 +97,7 @@ def construct_dataset(system): with Pool( os.cpu_count() - // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_initialized() else 1) + // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_available() and dist.is_initialized() else 1) ) as pool: self.systems = pool.map(construct_dataset, systems) @@ -127,7 +127,7 @@ def construct_dataset(system): self.batch_sizes = batch_size * np.ones(len(systems), dtype=int) assert len(self.systems) == len(self.batch_sizes) for system, batch_size in zip(self.systems, self.batch_sizes): - if dist.is_initialized(): + if dist.is_available() and dist.is_initialized(): system_sampler = DistributedSampler(system) self.sampler_list.append(system_sampler) else: @@ -138,7 +138,7 @@ def construct_dataset(system): num_workers=0, # Should be 0 to avoid too many threads forked sampler=system_sampler, collate_fn=collate_batch, - shuffle=(not dist.is_initialized()) and shuffle, + shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle, ) self.dataloaders.append(system_dataloader) self.index.append(len(system_dataloader)) From e9c7a0e94b0a65ba8be3b98e442b542ee3f143e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 7 Apr 2024 06:50:27 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pt/train/training.py | 46 ++++++++++++++++++++++++++++------- deepmd/pt/utils/dataloader.py | 9 +++++-- 2 files changed, 44 insertions(+), 11 deletions(-) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 1cdc383d01..030e9ffdfb 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -122,8 +122,14 @@ def __init__( self.model_keys = ( list(model_params["model_dict"]) if self.multi_task else ["Default"] ) - self.rank = dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 - self.world_size = dist.get_world_size() if dist.is_available() and dist.is_initialized() else 1 + self.rank = ( + dist.get_rank() if dist.is_available() and dist.is_initialized() else 0 + ) + self.world_size = ( + dist.get_world_size() + if dist.is_available() and dist.is_initialized() + else 1 + ) self.num_model = len(self.model_keys) # Iteration config @@ -169,7 +175,9 @@ def get_dataloader_and_buffer(_data, _params): _data, sampler=_sampler, batch_size=None, - num_workers=NUM_WORKERS if dist.is_available() else 0, # setting to 0 diverges the behavior of its iterator; should be >=1 + num_workers=NUM_WORKERS + if dist.is_available() + else 0, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, pin_memory=True, ) @@ -734,7 +742,11 @@ def step(_step_id, task_key="Default"): elif self.opt_type == "LKF": if isinstance(self.loss, EnergyStdLoss): KFOptWrapper = KFOptimizerWrapper( - self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized() + self.wrapper, + self.optimizer, + 24, + 6, + dist.is_available() and dist.is_initialized(), ) pref_e = self.opt_param["kf_start_pref_e"] * ( self.opt_param["kf_limit_pref_e"] @@ -753,7 +765,9 @@ def step(_step_id, task_key="Default"): # [coord, atype, natoms, mapping, shift, nlist, box] model_pred = {"energy": p_energy, "force": p_force} module = ( - self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper + self.wrapper.module + if dist.is_available() and dist.is_initialized() + else self.wrapper ) def fake_model(): @@ -768,10 +782,16 @@ def fake_model(): ) elif isinstance(self.loss, DenoiseLoss): KFOptWrapper = KFOptimizerWrapper( - self.wrapper, self.optimizer, 24, 6, dist.is_available() and dist.is_initialized() + self.wrapper, + self.optimizer, + 24, + 6, + dist.is_available() and dist.is_initialized(), ) module = ( - self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper + self.wrapper.module + if dist.is_available() and dist.is_initialized() + else self.wrapper ) model_pred = KFOptWrapper.update_denoise_coord( input_dict, @@ -924,7 +944,11 @@ def log_loss_valid(_task_key="Default"): # Handle the case if rank 0 aborted and re-assigned self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt") - module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper + module = ( + self.wrapper.module + if dist.is_available() and dist.is_initialized() + else self.wrapper + ) self.save_model(self.latest_model, lr=cur_lr, step=_step_id) log.info(f"Saved model to {self.latest_model}") symlink_prefix_files(self.latest_model.stem, self.save_ckpt) @@ -990,7 +1014,11 @@ def log_loss_valid(_task_key="Default"): prof.stop() def save_model(self, save_path, lr=0.0, step=0): - module = self.wrapper.module if dist.is_available() and dist.is_initialized() else self.wrapper + module = ( + self.wrapper.module + if dist.is_available() and dist.is_initialized() + else self.wrapper + ) module.train_infos["lr"] = lr module.train_infos["step"] = step torch.save( diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index 4705c6d0b4..8ebe75868e 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -97,7 +97,11 @@ def construct_dataset(system): with Pool( os.cpu_count() - // (int(os.environ["LOCAL_WORLD_SIZE"]) if dist.is_available() and dist.is_initialized() else 1) + // ( + int(os.environ["LOCAL_WORLD_SIZE"]) + if dist.is_available() and dist.is_initialized() + else 1 + ) ) as pool: self.systems = pool.map(construct_dataset, systems) @@ -138,7 +142,8 @@ def construct_dataset(system): num_workers=0, # Should be 0 to avoid too many threads forked sampler=system_sampler, collate_fn=collate_batch, - shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle, + shuffle=(not (dist.is_available() and dist.is_initialized())) + and shuffle, ) self.dataloaders.append(system_dataloader) self.index.append(len(system_dataloader))