From 31dc753cb73029f55732f781ec6cda2ea1d4507b Mon Sep 17 00:00:00 2001 From: Wei-Tse Hsu Date: Sat, 26 Aug 2023 02:51:20 -0600 Subject: [PATCH] Improved error handling in run_EEXE.py --- ensemble_md/cli/run_EEXE.py | 180 ++++++++++++++++++++---------------- ensemble_md/ensemble_EXE.py | 2 +- 2 files changed, 99 insertions(+), 83 deletions(-) diff --git a/ensemble_md/cli/run_EEXE.py b/ensemble_md/cli/run_EEXE.py index 66307885..0fb2b1d0 100644 --- a/ensemble_md/cli/run_EEXE.py +++ b/ensemble_md/cli/run_EEXE.py @@ -12,12 +12,14 @@ import time import copy import shutil +import traceback import argparse import numpy as np from mpi4py import MPI from datetime import datetime from ensemble_md.utils import utils +from ensemble_md.utils import gmx_parser from ensemble_md.ensemble_EXE import EnsembleEXE @@ -134,72 +136,81 @@ def main(): EEXE.get_ref_dist() for i in range(start_idx, EEXE.n_iter): - if rank == 0: - # Step 3: Swap the coordinates - # 3-1. For all the replica simulations, - # (1) Find the last sampled state and the corresponding lambda values from the DHDL files. - # (2) Find the final Wang-Landau incrementors and weights from the LOG files. - dhdl_files = [f'sim_{j}/iteration_{i - 1}/dhdl.xvg' for j in range(EEXE.n_sim)] - log_files = [f'sim_{j}/iteration_{i - 1}/md.log' for j in range(EEXE.n_sim)] - states_ = EEXE.extract_final_dhdl_info(dhdl_files) - wl_delta, weights_, counts = EEXE.extract_final_log_info(log_files) - print() - - # 3-2. Identify swappable pairs, propose swap(s), calculate P_acc, and accept/reject swap(s) - # Note after `get_swapping_pattern`, `states_` and `weights_` won't be necessarily - # since they are updated by `get_swapping_pattern`. (Even if the function does not explicitly - # returns `states_` and `weights_`, `states_` and `weights_` can still be different after - # the use of the function.) Therefore, here we create copyes for `states_` and `weights_` - # before the use of `get_swapping_pattern`, so we can use them in `histogram_correction`, - # `combine_weights` and `update_MDP`. - states = copy.deepcopy(states_) - weights = copy.deepcopy(weights_) - swap_pattern, swap_list = EEXE.get_swapping_pattern(dhdl_files, states_, weights_) # swap_list will only be used for modify_coords # noqa: E501 - - # 3-3. Calculate the weights averaged since the last update of the Wang-Landau incrementor. - # Note that the averaged weights are used for histogram correction/weight combination. - # For the calculation of the acceptance ratio (inside get_swapping_pattern), final weights should be used. - if EEXE.N_cutoff != -1 or EEXE.w_combine is True: - # Only when histogram correction/weight combination is needed. - weights_avg, weights_err = EEXE.get_averaged_weights(log_files) - - # 3-4. Perform histogram correction/weight combination - # Note that we never use final weights but averaged weights here. - # The product of this step should always be named as "weights" to be used in update_MDP - if wl_delta != [None for i in range(EEXE.n_sim)]: # weight-updating - print(f'\nCurrent Wang-Landau incrementors: {wl_delta}') - if EEXE.N_cutoff != -1 and EEXE.w_combine is True: - # perform both - weights_avg = EEXE.histogram_correction(weights_avg, counts) - weights, g_vec = EEXE.combine_weights(weights_avg) # inverse-variance weighting seems worse - EEXE.g_vecs.append(g_vec) - elif EEXE.N_cutoff == -1 and EEXE.w_combine is True: - # only perform weight combination - print('\nNote: No histogram correction will be performed.') - weights, g_vec = EEXE.combine_weights(weights_avg) # inverse-variance weighting seems worse - EEXE.g_vecs.append(g_vec) - elif EEXE.N_cutoff != -1 and EEXE.w_combine is False: - # only perform histogram correction - print('\nNote: No weight combination will be performed.') - weights = EEXE.histogram_correction(weights_avg, counts) + # For a large code block like below executed on rank 0, we try to catch any exception and abort the simulation. + # So if there is bug, the execution will be terminated and no computation time will be wasted. + try: + if rank == 0: + # Step 3: Swap the coordinates + # 3-1. For all the replica simulations, + # (1) Find the last sampled state and the corresponding lambda values from the DHDL files. + # (2) Find the final Wang-Landau incrementors and weights from the LOG files. + dhdl_files = [f'sim_{j}/iteration_{i - 1}/dhdl.xvg' for j in range(EEXE.n_sim)] + log_files = [f'sim_{j}/iteration_{i - 1}/md.log' for j in range(EEXE.n_sim)] + states_ = EEXE.extract_final_dhdl_info(dhdl_files) + wl_delta, weights_, counts = EEXE.extract_final_log_info(log_files) + print() + + # 3-2. Identify swappable pairs, propose swap(s), calculate P_acc, and accept/reject swap(s) + # Note after `get_swapping_pattern`, `states_` and `weights_` won't be necessarily + # since they are updated by `get_swapping_pattern`. (Even if the function does not explicitly + # returns `states_` and `weights_`, `states_` and `weights_` can still be different after + # the use of the function.) Therefore, here we create copyes for `states_` and `weights_` + # before the use of `get_swapping_pattern`, so we can use them in `histogram_correction`, + # `combine_weights` and `update_MDP`. + states = copy.deepcopy(states_) + weights = copy.deepcopy(weights_) + swap_pattern, swap_list = EEXE.get_swapping_pattern(dhdl_files, states_, weights_) # swap_list will only be used for modify_coords # noqa: E501 + + # 3-3. Calculate the weights averaged since the last update of the Wang-Landau incrementor. + # Note that the averaged weights are used for histogram correction/weight combination. + # For calculating the acceptance ratio (inside get_swapping_pattern), final weights should be used. + if EEXE.N_cutoff != -1 or EEXE.w_combine is True: + # Only when histogram correction/weight combination is needed. + weights_avg, weights_err = EEXE.get_averaged_weights(log_files) + + # 3-4. Perform histogram correction/weight combination + # Note that we never use final weights but averaged weights here. + # The product of this step should always be named as "weights" to be used in update_MDP + if wl_delta != [None for i in range(EEXE.n_sim)]: # weight-updating + print(f'\nCurrent Wang-Landau incrementors: {wl_delta}') + if EEXE.N_cutoff != -1 and EEXE.w_combine is True: + # perform both + weights_avg = EEXE.histogram_correction(weights_avg, counts) + weights, g_vec = EEXE.combine_weights(weights_avg) # inverse-variance weighting seems worse + EEXE.g_vecs.append(g_vec) + elif EEXE.N_cutoff == -1 and EEXE.w_combine is True: + # only perform weight combination + print('\nNote: No histogram correction will be performed.') + weights, g_vec = EEXE.combine_weights(weights_avg) # inverse-variance weighting seems worse + EEXE.g_vecs.append(g_vec) + elif EEXE.N_cutoff != -1 and EEXE.w_combine is False: + # only perform histogram correction + print('\nNote: No weight combination will be performed.') + weights = EEXE.histogram_correction(weights_avg, counts) + else: + weights_current = [gmx_parser.parse_log(log_files[i])[0][-1] for i in range(EEXE.n_sim)] + w_for_printing = EEXE.combine_weights(weights_current)[1] + print('\nNote: No histogram correction will be performed.') + print('Note: No weight combination will be performed.') + print(f'The alchemical weights of all states: \n {list(np.round(w_for_printing, decimals=3))}') + + # 3-5. Modify the MDP files and swap out the GRO files (if needed) + # Here we keep the lambda range set in mdp the same across different iterations in the same folder but swap out the gro file # noqa: E501 + # Note we use states (copy of states_) instead of states_ in update_MDP. + for j in list(range(EEXE.n_sim)): + os.mkdir(f'sim_{j}/iteration_{i}') + MDP = EEXE.update_MDP(f"sim_{j}/iteration_{i - 1}/expanded.mdp", j, i, states, wl_delta, weights, counts) # modify with a new template # noqa: E501 + MDP.write(f"sim_{j}/iteration_{i}/expanded.mdp", skipempty=True) + # In run_EEXE(i, swap_pattern), where the tpr files will be generated, we use the top file at the + # level of the simulation (the file that will be shared by all simulations). For the gro file, we + # pass swap_pattern to the function to figure it out internally. else: - w_for_printing = EEXE.combine_weights(weights_avg)[1] - print('\nNote: No histogram correction will be performed.') - print('Note: No weight combination will be performed.') - print(f'The alchemical weights of all states: \n {list(np.round(w_for_printing, decimals=3))}') - - # 3-5. Modify the MDP files and swap out the GRO files (if needed) - # Here we keep the lambda range set in mdp the same across different iterations in the same folder but swap out the gro file # noqa: E501 - # Note we use states (copy of states_) instead of states_ in update_MDP. - for j in list(range(EEXE.n_sim)): - os.mkdir(f'sim_{j}/iteration_{i}') - MDP = EEXE.update_MDP(f"sim_{j}/iteration_{i - 1}/expanded.mdp", j, i, states, wl_delta, weights, counts) # modify with a new template # noqa: E501 - MDP.write(f"sim_{j}/iteration_{i}/expanded.mdp", skipempty=True) - # In run_EEXE(i, swap_pattern), where the tpr files will be generated, we use the top file at the - # level of the simulation (the file that will be shared by all simulations). For the gro file, we pass - # swap_pattern to the function to figure it out internally. - else: - swap_pattern, swap_list = None, None + swap_pattern, swap_list = None, None + + except Exception: + print('\n--------------------------------------------------------------------------\n') + print(f'An error occurred on rank 0:\n{traceback.format_exc()}') + MPI.COMM_WORLD.Abort(1) if -1 not in EEXE.equil and 0 not in EEXE.equil: # This is the case where the weights are equilibrated in a weight-updating simulation. @@ -217,22 +228,27 @@ def main(): pass else: if EEXE.modify_coords_fn is not None: - if rank == 0: - for j in range(len(swap_list)): - print('\nModifying the coordinates of the following output GRO files ...') - # gro_1 and gro_2 are the simlation outputs (that we want to back up) and the inputs to modify_coords # noqa: E501 - gro_1 = f'sim_{swap_list[j][0]}/iteration_{i-1}/confout.gro' - gro_2 = f'sim_{swap_list[j][1]}/iteration_{i-1}/confout.gro' - print(f' - {gro_1}\n - {gro_2}') - - # Now we rename gro_1 and gro_2 to back them up - gro_1_backup = gro_1.split('.gro')[0] + '_backup.gro' - gro_2_backup = gro_2.split('.gro')[0] + '_backup.gro' - os.rename(gro_1, gro_1_backup) - os.rename(gro_2, gro_2_backup) - - # Here we input gro_1_backup and gro_2_backup and modify_coords_fn will save the modified gro files as gro_1 and gro_2 # noqa: E501 - EEXE.modify_coords_fn(gro_1_backup, gro_2_backup) # the order should not matter + try: + if rank == 0: + for j in range(len(swap_list)): + print('\nModifying the coordinates of the following output GRO files ...') + # gro_1 and gro_2 are the simlation outputs (that we want to back up) and the inputs to modify_coords # noqa: E501 + gro_1 = f'sim_{swap_list[j][0]}/iteration_{i-1}/confout.gro' + gro_2 = f'sim_{swap_list[j][1]}/iteration_{i-1}/confout.gro' + print(f' - {gro_1}\n - {gro_2}') + + # Now we rename gro_1 and gro_2 to back them up + gro_1_backup = gro_1.split('.gro')[0] + '_backup.gro' + gro_2_backup = gro_2.split('.gro')[0] + '_backup.gro' + os.rename(gro_1, gro_1_backup) + os.rename(gro_2, gro_2_backup) + + # Here we input gro_1_backup and gro_2_backup and modify_coords_fn will save the modified gro files as gro_1 and gro_2 # noqa: E501 + EEXE.modify_coords_fn(gro_1_backup, gro_2_backup) # the order should not matter + except Exception: + print('\n--------------------------------------------------------------------------\n') + print(f'\nAn error occurred on rank 0:\n{traceback.format_exc()}') + MPI.COMM_WORLD.Abort(1) # 4-2. Run another ensemble of simulations EEXE.run_EEXE(i, swap_pattern) @@ -278,4 +294,4 @@ def main(): print(f'\nTime elapsed: {utils.format_time(time.time() - t1)}') - MPI.COMM_WORLD.Abort(0) + MPI.Finalize() diff --git a/ensemble_md/ensemble_EXE.py b/ensemble_md/ensemble_EXE.py index 6c64dca8..d6d8f0d1 100644 --- a/ensemble_md/ensemble_EXE.py +++ b/ensemble_md/ensemble_EXE.py @@ -1229,7 +1229,7 @@ def combine_weights(self, weights, weights_err=None): if self.verbose is True: w = np.round(weights, decimals=3).tolist() # just for printing - print('\n Modified weights:') + print('\n Combined weights:') for i in range(len(w)): print(f' Rep {i}: {w[i]}')