diff --git a/docs/_static/REXEE_illustration.png b/docs/_static/REXEE_illustration.png new file mode 100644 index 00000000..79697790 Binary files /dev/null and b/docs/_static/REXEE_illustration.png differ diff --git a/docs/_static/REXEE_more_configs.png b/docs/_static/REXEE_more_configs.png new file mode 100644 index 00000000..bfbd642b Binary files /dev/null and b/docs/_static/REXEE_more_configs.png differ diff --git a/docs/_static/algorithm.png b/docs/_static/algorithm.png new file mode 100644 index 00000000..56469132 Binary files /dev/null and b/docs/_static/algorithm.png differ diff --git a/docs/_static/css/custom.css b/docs/_static/css/custom.css new file mode 100644 index 00000000..c7d1eb11 --- /dev/null +++ b/docs/_static/css/custom.css @@ -0,0 +1,6 @@ +.math { + text-align: left; +} +.eqno { + float: right; +} \ No newline at end of file diff --git a/docs/analysis.rst b/docs/analysis.rst deleted file mode 100644 index 3022e660..00000000 --- a/docs/analysis.rst +++ /dev/null @@ -1,28 +0,0 @@ -1. An overview -============== -Automated data analysis of an REXEE simulation is allowed by the CLI :code:`analyze_REXEE`, which -share the same input YAML file as the CLI :code:`run_REXEE`. Relevant parameters specified in the YAML -file for data analysis can be found in this section: :ref:`doc_analysis_params`. - -- Analysis based on transitions between replicas -- Analysis based on transitions between states -- Analysis based on Markov state models -- Free energy calculations - - -2. Transition matrix -==================== -2.1. Theoretical and experimental transition matrix ---------------------------------------------------- - -2.2. State transition matrix ----------------------------- - -2.3. Replica transition matrix ------------------------------- - -3. Overlap matrix -================= - -4. Free energy calculation -========================== diff --git a/docs/api/api_ensemble_EXE.rst b/docs/api/api_replica_exchange_EE.rst similarity index 100% rename from docs/api/api_ensemble_EXE.rst rename to docs/api/api_replica_exchange_EE.rst diff --git a/docs/conf.py b/docs/conf.py index 8119210b..befb2c4c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -52,6 +52,8 @@ 'sphinx.ext.extlinks', 'sphinx.ext.todo', 'nbsphinx', + 'sphinx_collapse', + # 'sphinxcontrib.pseudocode', ] autosummary_generate = True @@ -106,6 +108,8 @@ # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ['_static'] +html_css_files = ['css/custom.css'] + # Custom sidebar templates, must be a dictionary that maps document names # to template names. # @@ -191,4 +195,6 @@ 'pandas': ('https://pandas.pydata.org/pandas-docs/stable/', None), 'pymbar': ('https://pymbar.readthedocs.io/en/latest/', None), 'alchemlyb': ('https://alchemlyb.readthedocs.io/en/latest/', None), -} \ No newline at end of file +} + +numfig = True # necessary for the algorithm to show (see https://github.com/xxks-kkk/sphinxcontrib-pseudocode/issues/18) \ No newline at end of file diff --git a/docs/examples/run_REXEE_modify_inputs.ipynb b/docs/examples/run_REXEE_modify_inputs.ipynb index 2903fcdb..bc7b0c3f 100644 --- a/docs/examples/run_REXEE_modify_inputs.ipynb +++ b/docs/examples/run_REXEE_modify_inputs.ipynb @@ -5,7 +5,7 @@ "id": "87324540", "metadata": {}, "source": [ - "# Tutorial 3: REXEE for multiple serial mutations" + "# Tutorial 3: Multi-topology REXEE (MT-REXEE) simulations" ] }, { diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 9287a4d8..ce878058 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -1,23 +1,18 @@ 1. Introduction =============== :code:`ensemble_md` is a Python package that provides methods for setting up, -running, and analyzing GROMACS simulation ensembles. The current implementation is -mainly for synchronous replica exchange (REX) of expanded ensemble (EE), abbreviated as -REXEE. In the future, we will develop methods like asynchronous REXEE, or multi-topology REXEE. -In the current implementation, the module :code:`subprocess` -is used to launch GROMACS commands, but we will switch to `SCALE-MS`_ for this purpose -in the future when possible. - - -.. _`SCALE-MS`: https://scale-ms.readthedocs.io/en/latest/ +running, and analyzing GROMACS simulation ensembles. Currently, the package implements +all the necessary algorithms for running synchronous replica exchange (REX) of expanded ensembles (EE), abbreviated as +REXEE, as well as its multi-topology (MT) variation MT-REXEE. Our future work includes +implementing asynchronous REXEE and other possible variations of the REXEE method. 2. Installation =============== 2.1. Requirements ----------------- -Before installing :code:`ensemble_md`, one should have working versions of `GROMACS`_. Please refer to the linked documentation for full installation instructions. -All the other pip-installable dependencies of :code:`ensemble_md` (specified in :code:`setup.py` of the package) +Before installing :code:`ensemble_md`, one should have a working version of `GROMACS`_. Please refer to the GROMACS documentation for full installation instructions. +All the other pip-installable dependencies required by :code:`ensemble_md` (specified in :code:`setup.py` of the package) will be automatically installed during the installation of the package. .. _`GROMACS`: https://manual.gromacs.org/current/install-guide/index.html @@ -32,31 +27,36 @@ will be automatically installed during the installation of the package. 2.3. Installation from source ----------------------------- One can also install :code:`ensemble_md` from the source code, which is available in our -`github repository`_. Specifically, execute the following commands: +`GitHub repository`_. Specifically, one can execute the following commands: :: git clone https://github.com/wehs7661/ensemble_md.git cd ensemble_md/ pip install . -If you are interested in contributing to the project, append the -last command with the flag :code:`-e` to install the project in the editable mode -so that changes you make in the source code will take effects without re-installation of the package. -(Pull requests to the project repository are welcome!) +If you would like to install the package in the editable mode, simply append the last command with the flag :code:`-e` +so that changes you make in the source code will take effect without re-installation of the package. This is particularly +useful if you would like to contribute to the development of the package. (Pull requests and issues are always welcome!) -.. _`github repository`: https://github.com/wehs7661/ensemble_md.git +.. _`GitHub repository`: https://github.com/wehs7661/ensemble_md.git 3. Testing ========== -To perform unit tests for this package, execute the following command in the home directory of the project: +3.1. Tests for the functions that do not use MPI +------------------------------------------------ +Most of the tests in this package do not require MPI. To perform unit tests for these functions, execute the following command in the home directory of the project: :: pytest -vv --disable-pytest-warnings --cov=ensemble_md --cov-report=xml --color=yes ensemble_md/tests/ -or +Note that the flags :code:`--cov` and :code:`--cov-report` are just for generating a coverage report and can be omitted. +These flags require that :code:`pytest-cov` be installed. +3.2. Tests for the functions that use MPI +----------------------------------------- +For the tests that require MPI (all implemented in :code:`tests/test_mpi_func.py`), one can use the following command: :: - python -m pytest -vv --disable-pytest-warnings --cov=ensemble_md --cov-report=xml --color=yes ensemble_md/tests/ + mpirun -np 4 pytest -vv --disable-pytest-warnings --cov=ensemble_md --cov-report=xml --color=yes ensemble_md/tests/test_mpi_func.py --with-mpi -Note that the flags :code:`--cov` and :code:`--cov-report` require that :code:`pytest-cov` be installed. \ No newline at end of file +Note that the flag :code:`--with-mpi` requires that :code:`pytest-mpi` be installed. diff --git a/docs/index.rst b/docs/index.rst index 9d074720..a2d4ca6d 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -25,11 +25,6 @@ the future. simulations :maxdepth: 2 :caption: Launching REXEE simulations: - -.. toctree:: - analysis - :maxdepth: 2 - :caption: Data analysis: .. toctree:: examples/run_REXEE @@ -39,12 +34,17 @@ the future. :caption: Tutorials: .. toctree:: - api/api_ensemble_EXE + api/api_replica_exchange_EE.rst api/api_analysis api/api_utils :maxdepth: 2 :caption: API documentation: +.. toctree:: + references + :maxdepth: 1 + :caption: References: + Others ====== diff --git a/docs/references.rst b/docs/references.rst new file mode 100644 index 00000000..37131501 --- /dev/null +++ b/docs/references.rst @@ -0,0 +1,6 @@ +.. -*- coding: utf-8 -*- + +References +========== + +.. [Hsu2024] Hsu, Wei-Tse, and Michael R. Shirts. "Replica exchange of expanded ensembles: A generalized ensemble approach with enhanced flexibility and parallelizability." arXiv preprint arXiv:2308.06938 (2023). doi: `10.48550/arXiv.2308.06938 `_. \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index eed0dac8..934b419b 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -11,4 +11,6 @@ seaborn matplotlib pyemma pymbar==4.0.1 - +sphinx-collapse +ruptures +# sphinxcontrib-pseudocode diff --git a/docs/simulations.rst b/docs/simulations.rst index 4207c082..a561dc64 100644 --- a/docs/simulations.rst +++ b/docs/simulations.rst @@ -6,6 +6,8 @@ :code:`explore_REXEE` helps the user to figure out possible combinations of REXEE parameters, while :code:`run_REXEE` and :code:`analyze_REXEE` can be used to perform and analyze REXEE simulations, respectively. Below we provide more details about each of these CLIs. +.. _doc_explore_REXEE: + 1.1. CLI :code:`explore_REXEE` ------------------------------ Here is the help message of :code:`explore_REXEE`: @@ -14,21 +16,21 @@ Here is the help message of :code:`explore_REXEE`: usage: explore_REXEE [-h] -N N [-r R] [-n N] [-s S] [-c] [-e] - This code explores the parameter space of homogenous REXEE to help you figure out all - possible combinations of the number of replicas, the number of states in each replica, - and the number of overlapping states, and the total number states. + This CLI explores the parameter space of a homogenous REXEE simulation to help you + figure out all possible combinations of the number of replicas, the number of states in + each replica, and the number of overlapping states, given the total number of states. optional arguments: -h, --help show this help message and exit -N N, --N N The total number of states of the REXEE simulation. -r R, --r R The number of replicas that compose the REXEE simulation. - -n N, --n N The number of states for each replica. + -n N, --n N The number of states per replica. -s S, --s S The state shift between adjacent replicas. - -c, --cnst Whether the apply the constraint such that the number of overlapping - states does not exceed 50% of the number of states in both overlapping - replicas. + -c, --cnst Whether to apply the constraint such that the number of overlapping + states does not exceed 50% of the number of states in adjacent + replicas. (Default: False) -e, --estimate Whether to provide estimates of the chance of not having any swappable - pairs for each solution. + pairs for each solution. (Default: False) 1.2. CLI :code:`run_REXEE` @@ -39,35 +41,35 @@ Here is the help message of :code:`run_REXEE`: usage: run_REXEE [-h] [-y YAML] [-c CKPT] [-g G_VECS] [-o OUTPUT] [-m MAXWARN] - This code runs a REXEE simulation given necessary inputs. + This CLI runs a REXEE simulation given necessary inputs. optional arguments: - -h, --help show this help message and exit - -y YAML, --yaml YAML The input YAML file that contains REXEE parameters. (Default: - params.yaml) - -c CKPT, --ckpt CKPT The NPY file containing the replica-space trajectories. This file - is a necessary checkpoint file for extending the simulaiton. - (Default: rep_trajs.npy) - -g G_VECS, --g_vecs G_VECS - The NPY file containing the timeseries of the whole-range - alchemical weights. This file is a necessary input if ones wants - to update the file when extending the simulation. (Default: - g_vecs.npy) - -o OUTPUT, --output OUTPUT - The output file for logging how replicas interact with each - other. (Default: run_REXEE_log.txt) - -m MAXWARN, --maxwarn MAXWARN - The maximum number of warnings in parameter specification to be - ignored. - -In our current implementation, it is assumed that all replicas of an REXEE simulations are performed in -parallel using MPI. Naturally, performing an REXEE simulation using :code:`run_REXEE` requires a command-line interface + -h, --help show this help message and exit + -y YAML, --yaml YAML The file path of the input YAML file that contains REXEE + parameters. (Default: params.yaml) + -c CKPT, --ckpt CKPT The file path of the NPY file containing the replica-space + trajectories. This file is a necessary checkpoint file for + extending the simulation. (Default: rep_trajs.npy) + -g G_VECS, --g_vecs G_VECS + The file path of the NPY file containing the time series of the + whole-range alchemical weights. This file is a necessary input + if one wants to update the file when extending a weight- + updating simulation. (Default: g_vecs.npy) + -o OUTPUT, --output OUTPUT + The file path of the output file for logging how replicas + interact with each other. (Default: run_REXEE_log.txt) + -m MAXWARN, --maxwarn MAXWARN + The maximum number of warnings in parameter specification to be + ignored. (Default: 0) + +In our current implementation, it is assumed that all replicas of a REXEE simulation are performed in +parallel using MPI. Naturally, performing a REXEE simulation using :code:`run_REXEE` requires a command-line interface to launch MPI processes, such as :code:`mpirun` or :code:`mpiexec`. For example, on a 128-core node -in a cluster, one may use :code:`mpirun -np 4 run_REXEE` (or :code:`mpiexec -n 4 run_REXEE`) to run an REXEE simulation composed of 4 +in a cluster, one may use :code:`mpirun -np 4 run_REXEE` (or :code:`mpiexec -n 4 run_REXEE`) to run a REXEE simulation composed of 4 replicas with 4 MPI processes. Note that in this case, it is often recommended to explicitly specify -more details about resources allocated for each replica. For example, one can specifies :code:`{'-nt': 32}` -for the REXEE parameter `runtime_args` (specified in the input YAML file, see :ref:`doc_REXEE_parameters`), -so each of the 4 replicas will use 32 threads (assuming thread-MPI GROMACS), taking the full advantage +more details about resources allocated for each replica. For example, one can specify :code:`{'-nt': 32}` +for the REXEE parameter :code:`runtime_args` in the input YAML file (see :ref:`doc_REXEE_parameters`), +so each of the 4 replicas will use 32 threads (assuming thread-MPI GROMACS), taking full advantage of 128 cores. 1.3. CLI :code:`analyze_REXEE` @@ -77,36 +79,42 @@ Finally, here is the help message of :code:`analyze_REXEE`: :: usage: analyze_REXEE [-h] [-y YAML] [-o OUTPUT] [-rt REP_TRAJS] [-st STATE_TRAJS] - [-d DIR] [-m MAXWARN] + [-sts STATE_TRAJS_FOR_SIM] [-d DIR] [-m MAXWARN] - This code analyzes a REXEE simulation. Note that the template MDP file - specified in the YAML file needs to be available in the working directory. + This CLI analyzes a REXEE simulation. optional arguments: - -h, --help show this help message and exit - -y YAML, --yaml YAML The input YAML file used to run the REXEE simulation. (Default: - params.yaml) - -o OUTPUT, --output OUTPUT - The output log file that contains the analysis results of REXEE. - (Default: analyze_REXEE_log.txt) - -rt REP_TRAJS, --rep_trajs REP_TRAJS - The NPY file containing the replica-space trajectory. (Default: - rep_trajs.npy) - -st STATE_TRAJS, --state_trajs STATE_TRAJS - The NPY file containing the stitched state-space trajectory. If - the specified file is not found, the code will try to find all - the trajectories and stitch them. (Default: state_trajs.npy) - -d DIR, --dir DIR The name of the folder for storing the analysis results. - -m MAXWARN, --maxwarn MAXWARN - The maximum number of warnings in parameter specification to be - ignored. - -2. Recommended workflow + -h, --help show this help message and exit + -y YAML, --yaml YAML The file path of the input YAML file used to run the REXEE + simulation. (Default: params.yaml) + -o OUTPUT, --output OUTPUT + The file path of the output log file that contains the analysis + results of REXEE. (Default: analyze_REXEE_log.txt) + -rt REP_TRAJS, --rep_trajs REP_TRAJS + The file path of the NPY file containing the replica-space + trajectory. (Default: rep_trajs.npy) + -st STATE_TRAJS, --state_trajs STATE_TRAJS + The file path of the NPY file containing the stitched state- + space trajectory. If the specified file is not found, the code + will try to find all the trajectories and stitch them. (Default: + state_trajs.npy) + -sts STATE_TRAJS_FOR_SIM, --state_trajs_for_sim STATE_TRAJS_FOR_SIM + The file path of the NPY file containing the stitched state- + space time series for different state sets. If the specified + file is not found, the code will try to find all the time series + and stitch them. (Default: state_trajs.npy) + -d DIR, --dir DIR The path of the folder for storing the analysis results. + (Default: analysis) + -m MAXWARN, --maxwarn MAXWARN + The maximum number of warnings in parameter specification to be + ignored. (Default: 0) + +2. Implemented workflow ======================= -In this section, we introduce the workflow adopted by the CLI :code:`run_REXEE` that can be used to +In this section, we introduce the workflow implemented in the CLI :code:`run_REXEE` that can be used to launch REXEE simulations. While this workflow is made as flexible as possible, interested users can use functions defined :class:`ReplicaExchangeEE` to develop their own workflow, or consider contributing -to the source code of the CLI :code:`run_REXEE`. As an example, a hands-on tutorial that uses this workflow (using the CLI :code:`run_REXEE`) can be found in +to the source code of the CLI :code:`run_REXEE`. As an example, a hands-on tutorial that uses the CLI :code:`run_REXEE` can be found in `Tutorial 1: Launching a REXEE simulation`_. .. _`Tutorial 1: Launching a REXEE simulation`: examples/run_REXEE.ipynb @@ -114,116 +122,124 @@ to the source code of the CLI :code:`run_REXEE`. As an example, a hands-on tutor Step 1: Set up parameters ------------------------- -To run a REXEE simulation in GROMACS using :code:`run_REXEE.py`, one at -least needs to following four files: - -* One GRO file of the system of interest -* One TOP file of the system of interest -* One MDP template for customizing different MDP files for different replicas. -* One YAML file that specify the REXEE-relevant parameters. - -Currently, we only allow all replicas to be initiated with the same configuration represented -by the single GRO file, but the user should also be able to initialize different replicas with different -configurations (represented by multiple GRO files) in the near future. Also, the MDP template should contain parameters -common across all replicas and define the coupling parmaeters for all possible intermediate states, -so that we can cusotmize different MDP files by defining a subset of alchemical states in different -replicas. For REXEE simulations, some MDP parameters need additional care to be taken, which we describe in -:ref:`doc_mdp_params`. Importantly, to extend an REXEE simulation, one needs to additionally provide the following -two checkpoint files: - -* One NPY file containing the replica-space trajectories of different configurations saved by the previous run of REXEE simulation with a default name as :code:`rep_trajs.npy`. -* One NPY file containing the timeseries of the whole-range alchemical weights saved by the previous run of REXEE simulation with a default name as :code:`g_vecs.npy`. - -In :code:`run_REXEE.py`, the class :class:`.ReplicaExchangeEE` is instantiated with the given YAML file, where +To run a REXEE simulation in GROMACS using the CLI :code:`run_REXEE`, one at +least needs the following four files. (Check :ref:`doc_input_files` for more details.) + +* One YAML file that specifies REXEE parameters, as specified via the CLI :code:`run_REXEE`. +* One GRO file of the system of interest, as specified in the input YAML file. +* One TOP file of the system of interest, as specified in the input YAML file. +* One MDP template for customizing MDP files for different replicas, as specified in the input YAML file. + +Note that multiple GRO/TOP files can be provided to initiate different replicas with different configurations/topologies, +in which case the number of GRO/TOP files must be equal to the number of replicas. +Also, the MDP template should contain parameters shared by all replicas and define the coupling parameters for all +intermediate states. Moreover, additional care needs to be taken for specifying some MDP parameters need additional care to be taken, which we describe in +:ref:`doc_mdp_params`. Lastly, to extend a REXEE simulation, one needs to additionally provide the following +two files (generated by the existing simulation) as necessary checkpoints: + +* One NPY file containing the replica-space trajectories of different configurations, as specified in the input YAML file. +* One NPY file containing the time series of the whole-range alchemical weights, as specified in the input YAML file. This is only needed for extending a weight-updating REXEE simulation. + +In the CLI :code:`run_REXEE`, the class :class:`.ReplicaExchangeEE` is instantiated with the given YAML file, where the user needs to specify how the replicas should be set up or interact with each other during the simulation ensemble. Check :ref:`doc_parameters` for more details. Step 2: Run the 1st iteration ----------------------------- -With all the input files/parameters set up in the previous run, one can use run the first iteration, -using :obj:`.run_REXEE`, which uses :code:`subprocess.run` to launch GROMACS :code:`grompp` -and :code:`mdrun` commands in parallel. +After setting things up in the previous step, the CLI :code:`run_REXEE` uses the function :obj:`.run_REXEE` to run subprocess calls to +launch GROMACS :code:`grompp` and :code:`mdrun` commands in parallel for the first iteration. Step 3: Set up the new iteration -------------------------------- -In general, this step can be further divided into the following substeps. +In the CLI :code:`run_REXEE`, this step can be further divided into the following substeps. Step 3-1: Extract the final status of the previous iteration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To calculate the acceptance ratio and modify the mdp files in later steps, we first need to extract the information +To calculate the acceptance ratio and modify the MDP files in later steps, we first need to extract the information of the final status of the previous iteration. Specifically, for all the replica simulations, we need to -* Find the last sampled state and the corresponding lambda values from the DHDL files -* Find the final Wang-Landau incrementors and weights from the LOG files. +* Find the last sampled state and the corresponding lambda values from the DHDL files, which are necessary for both fixed-weight and weight-updating simulations. +* Find the final Wang-Landau incrementors and weights from the LOG files, which are necessary for a weight-updating simulation. These two tasks are done by :obj:`.extract_final_dhdl_info` and :obj:`.extract_final_log_info`. .. _doc_swap_basics: -Step 3-2: Identify swappable pairs and propose simulation swap(s) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -After the information of the final status of the previous iteration is extracted, we then identify swappable pairs. -Specifically, replicas can be swapped only if the states to be swapped are present in both of the alchemical ranges -corresponding to the two replicas. This definition automatically implies one necessary but not sufficient condition that -the replicas to be swapped should have overlapping alchemical ranges. Practically, if the states to be swapped are -not present in both alchemical ranges, information like :math:`\Delta U^i=U^i_n-U^j_m` will not be available -in either DHDL files and terms like :math:`\Delta g^i=g^i_n-g^i_m` cannot be calculated from the LOG files as well, which -makes the calculation of the acceptance ratio technicaly impossible. (For more details about the acceptance ratio is calculated -in different schemes for swapping, check the section :ref:`doc_acceptance`.) After the swappable pairs are identified, -the user can propose swap(s) using :obj:`propose_swaps`. Swap(s) will be proposed given the specified proposal scheme (see -more details about available proposal schemes in :ref:`doc_proposal`). - -Step 3-3: Decide whether to reject/accept the swap(s) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -This step is mainly done by :obj:`.get_swapped_configs`, which calls functions :obj:`.calc_prob_acc` and :obj:`.accept_or_reject`. -The former calculates the acceptance ratio from the DHDL/LOG files of the swapping replicas, while the latter draws a random number -and compare with the acceptance ratio to decide whether the proposed swap should be accepted or not. If mutiple swaps are wanted, -in :obj:`.get_swapped_configs`, the acceptance ratio of each swap will be evaluated so to decide whether the swap should be accepted -or not. Based on this :obj:`get_swapped_configs` returns a list of indices that represents the final configurations after all the swaps. - -Step 3-4: Combine the weights if needed +Step 3-2: Identify the swapping pattern ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the states that are present in the alchemical ranges of multiple replicas, it is likely that they are -sampled more frequenly overall. To leverage the fact that we collect more statistics for these states, it is recoomended -that the weights of these states be combined across all replicas that sampled these states. This task can be completed by -:obj:`combine_wieghts`, with the desired method specified in the input YAML file. For more details about different -methods for combining weights across different replicas, please refer to the section :ref:`doc_w_schemes`. - -Step 3-5: Modify the MDP files and swap out the GRO files (if needed) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -After the final configuration has been figured out by :obj:`get_swapped_configs` (and weights have bee combined by :obj:`combine_weights` -when needed), the user should set up the input files for the next iteration. In principle, the new iteration should inherit the final +Given the information of the final status of the previous simulation, the CLI :code:`run_REXEE` runs the function :obj:`.get_swapping_pattern` to figure out how the coordinates should be swapped between replicas. +Specifically, the function does the following: + +- Identify swappable pairs using the function :obj:`.identify_swappable_pairs`. Notably, replicas can be + swapped only if the states to be swapped are present in both of the state sets + corresponding to the two replicas. This definition automatically implies one necessary but not sufficient condition that + the replicas to be swapped should have overlapping state sets. Practically, if the states to be swapped are + not present in both state sets, potential energy differences required for the calculation of :math:`\Delta` + will not be available, which makes the calculation of the acceptance ratio technically impossible. +- Propose a swap using the function :obj:`.propose_swap`. +- Calculates the acceptance ratio using :math:`\Delta u` values + obtained from the DHDL files using the function :obj:`.calc_prob_acc`. +- Use the function :obj:`.accept_or_reject` to draw a random number and compare it with the acceptance ratio + to decide whether the swap should be accepted or not. +- Propose and evaluate multiple swaps if needed (e.g., when the exhaustive exchange proposal scheme is used), and finally return a list + that represents how the configurations should be swapped in the next iteration. + +For more details, please refer to the API documentation of the involved functions. + +Step 3-3: Apply correction schemes if needed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +For a weight-updating REXEE simulation, correction schemes may be applied if specified. Specifically, +the CLI :code:`run_REXEE` applies the weight combination scheme using the function :obj:`.combine_weights` +and the histogram correction scheme using the function :obj:`.histogram_correction`. +For more details about correction schemes, please refer to the section :ref:`doc_correction`. + +Step 3-4: Set up the input files for the next iteration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +After the final configuration has been figured out by :obj:`.get_swapping_pattern` (and the weights/counts have been adjusted by the specified correction schemes, if any), +the CLI :code:`run_REXEE` sets up input files for the next iteration. In principle, the new iteration should inherit the final status of the previous iteration. This means: -* For each replica, the input configuration for initializing a new iterations should be the output configuraiton of the previous iteration. For example, if the final configurations are represented by :code:`[1, 2, 0, 3]` (returned by :obj:`.get_swapped_configs`), then in the next iteration, replica 0 should be initialized by the output configuration of replica 1 in the previous iteration, while replica 3 can just inherit the output configuration from previous iteration of the same replica. Notably, instead of exchanging the MDP files, we recommend swapping out the coordinate files to exchange replicas. -* For each replica, the MDP file for the new iteration should be the same as the one used in the previous iteartion of the same replica except that parameters like :code:`tinit`, :code:`init_lambda_state`, :code:`init_wl_delta`, and :code:`init_lambda_weights` should be modified to the final values in the previous iteration. This can be done by :class:`.gmx_parser.MDP` and :obj:`.update_MDP`. +* For each replica, the input configuration for initializing a new iteration should be the output configuration of the previous iteration. For example, + if the final configurations are represented by :code:`[1, 2, 0, 3]` (returned by :obj:`.get_swapping_pattern`), then in the next iteration, replica 0 + should be initialized by the output configuration of replica 1 in the previous iteration, while replica 3 can just inherit the output configuration from the + previous iteration of the same replica. Notably, instead of exchanging the MDP files, the CLI :code:`run_REXEE` swaps out the coordinate files to exchange + replicas, which is equivalent to exchanging the MDP files. +* For each replica, the MDP file for the new iteration should be the same as the one used in the previous iteration of the same replica except that parameters + like :code:`tinit`, :code:`init_lambda_state`, :code:`init_wl_delta`, and :code:`init_lambda_weights` should be modified to the final values in the previous + iteration. In the CLI :code:`run_REXEE`, this is done by :obj:`.update_MDP`. Step 4: Run the new iteration ----------------------------- After the input files for a new iteration have been set up, we use the procedure in Step 2 to -run a new iteration. Then, the user should loop between Steps 3 and 4 until the desired number of -iterations (:code:`n_iterations`) is reached. +run a new iteration. Then, the CLI :code:`run_REXEE` loops between Steps 3 and 4 until the desired number of +iterations (:code:`n_iterations` specified in the input YAML file) is reached. .. _doc_parameters: 3. Input YAML parameters ======================== -In the current implementation of the algorithm, 28 parameters can be specified in the input YAML file. +In the current implementation of the algorithm, 30 parameters can be specified in the input YAML file. Note that the two CLIs :code:`run_REXEE` and :code:`analyze_REXEE` share the same input YAML file, so we also include parameters for data analysis here. 3.1. GROMACS executable ----------------------- - - :code:`gmx_executable`: (Optional, Default: :code:`gmx_mpi`) + - :code:`gmx_executable`: (Optional, Default: :code:`'gmx_mpi'`) The GROMACS executable to be used to run the REXEE simulation. The value could be as simple as :code:`gmx` - or :code:`gmx_mpi` if the exeutable has been sourced. Otherwise, the full path of the executable (e.g. + or :code:`gmx_mpi` if the executable has been sourced. Otherwise, the full path of the executable (e.g., :code:`/usr/local/gromacs/bin/gmx`, the path returned by the command :code:`which gmx`) should be used. - Note that REXEE only works with MPI-enabled GROMACS. + Currently, our implementation only works with thread-MPI GROMACS. An implementation that works with MPI-enabled + GROMACS will be released soon. (Check `Issue 20`_ for the current progress.) -3.2. Input settings -------------------- +.. _`Issue 20`: https://github.com/wehs7661/ensemble_md/issues/20 + + +.. _doc_input_files: + +3.2. Input files +---------------- - :code:`gro`: (Required) The path of the input system configuration in the form of GRO file(s) used to initiate the REXEE simulation. If only one GRO file is specified, @@ -236,24 +252,24 @@ include parameters for data analysis here. the i-th TOP file corresponds to the i-th GRO file. - :code:`mdp`: (Required) The path of the input MDP file used to initiate the REXEE simulation. Specifically, this input MDP file will serve as a template for - customizing MDP files for all replicas. Therefore, the MDP template must have the whole range of :math:`λ` values. - and the corresponding weights (in fixed-weight simulations). This holds for REXEE simulations for multiple serial mutations as well. - For example, in an REXEE simulation that mutates methane to ethane in one replica and ethane to propane in the other replica, if + customizing MDP files for all replicas. Therefore, the MDP template must specify the whole range of :math:`λ` values + and :math:`λ`-relevant parameters. This holds for REXEE simulations for multiple serial mutations as well. + For example, in a REXEE simulation that mutates methane to ethane in one replica and ethane to propane in the other replica, if exchanges only occur in the end states, then one could have :math:`λ` values like :code:`0.0 0.3 0.7 1.0 0.0 0.3 ...`. Notably, unlike the parameters :code:`gro` and :code:`top`, only one MDP file can be specified for the parameter :code:`mdp`. If you wish to use different parameters for different replicas, please use the parameter :code:`mdp_args`. - :code:`modify_coords`: (Optional, Default: :code:`None`) - The file path to the Python module for modifying the output coordinates of the swapping replicas - before the coordinate exchange, which is generally required in REXEE simulations for multiple serial mutations. + The file path of the Python module for modifying the output coordinates of the swapping replicas + before the coordinate exchange, which is generally required in multi-topology REXEE simulations. For the CLI :code:`run_REXEE` to work, here is the predefined contract for the module/function based on the assumptions :code:`run_REXEE` makes. Modules/functions not obeying the contract are unlikely to work. - Multiple functions can be defined in the module, but the function for coordinate manipulation must have the same name as the module itself. - The function must only have two compulsory arguments, which are the two GRO files to be modified. The function must not depend on the order of the input GRO files. - The function must return :code:`None` (i.e., no return value). - - The function must save the modified GRO file as :code:`confout.gro`. Specifically, if :code:`directory_A/output.gro` and :code:`directory_B/output.gro` are input, then :code:`directory_A/confout.gro` and :code:`directory_B/confout.gro` must be saved. (For more information, please visit `Tutorial 3: REXEE for multiple serial mutations`_.) Note that in the CLI :code:`run_REXEE`, :code:`confout.gro` generated as the simulation output will be automatically backed up (with a :code:`_backup` suffix) to prevent overwriting. + - The function must save the modified GRO file as :code:`confout.gro`. Specifically, if :code:`directory_A/output.gro` and :code:`directory_B/output.gro` are input, then :code:`directory_A/confout.gro` and :code:`directory_B/confout.gro` must be saved. (For more information, please visit `Tutorial 3: Multi-topology REXEE (MT-REXEE) simulations`_.) Note that in the CLI :code:`run_REXEE`, :code:`confout.gro` generated by GROMACS will be automatically renamed with a :code:`_backup` suffix to prevent overwriting. -.. _`Tutorial 3: REXEE for multiple serial mutations`: examples/run_REXEE_modify_inputs.ipynb +.. _`Tutorial 3: Multi-topology REXEE (MT-REXEE) simulations`: examples/run_REXEE_modify_inputs.ipynb .. _doc_REXEE_parameters: @@ -263,57 +279,60 @@ include parameters for data analysis here. - :code:`n_sim`: (Required) The number of replica simulations. - :code:`n_iter`: (Required) - The number of iterations. In an REXEE simulation, one iteration means one exchange attempt. Notably, this can be used to extend the REXEE simulation. - For example, if one finishes an REXEE simulation with 10 iterations (with :code:`n_iter=10`) and wants to continue the simulation from iteration 11 to 30, - setting :code:`n_iter` in the next execution of :code:`run_REXEE` should suffice. + The number of iterations. In a REXEE simulation, one iteration means one exchange interval, which can involve multiple proposed swaps + (if the exhaustive exchange proposal scheme is used). Note that when extending a simulation is desired and the necessary checkpoint files are provided, + this parameter takes into account the number of iterations that have already been performed. That is, if a simulation has already been performed for 100 iterations, + and one wants to extend it for 50 more iterations, then the value of this parameter should be 150. - :code:`s`: (Required) - The shift in the alchemical ranges between adjacent replicas (e.g. :math:`s = 2` if :math:`λ_2 = (2, 3, 4)` and :math:`λ_3 = (4, 5, 6)`. + The shift in the state sets between adjacent replicas. For example, if replica 1 samples states 0, 1, 2, 3 and replica 2 samples + states 2, 3, 4, 5, then :code:`s = 2` should be specified. - :code:`nst_sim`: (Optional, Default: :code:`nsteps` in the template MDP file) - The number of simulation steps to carry out for one iteration, i.e. stpes between exchanges proposed between replicas. The value specified here will - overwrite the :code:`nsteps` parameter in the MDP file of each iteration. This option also assumes replicas with homogeneous simulation lengths. + The exchange period, i.e., the number of simulation steps to carry out for one iteration. The value specified here will + overwrite the :code:`nsteps` parameter in the MDP file of each iteration. Note that this option assumes replicas with homogeneous simulation lengths. - :code:`add_swappables`: (Optional, Default: :code:`None`) A list of lists that additionally consider states (in global indices) that can be swapped. For example, :code:`add_swappables=[[4, 5], [14, 15]]` means that if a replica samples state 4, it can be swapped with another replica that samples state 5 and vice versa. The same logic applies to states 14 and 15. - This could be useful for REXEE simulations for multiple serial mutations, where we enforce exchanges between states 4 and 5 (and 14 and 15) and perform - coordinate manipulation. - - :code:`proposal`: (Optional, Default: :code:`exhaustive`) + This could be useful for multi-topology REXEE (MT-REXEE) simulations, where we enforce the consideration of exchanges between states 4 and 5 (and 14 and 15) and perform + coordinate manipulation when necessary. + - :code:`proposal`: (Optional, Default: :code:`'exhaustive'`) The method for proposing simulations to be swapped. Available options include :code:`single`, :code:`neighboring`, and :code:`exhaustive`. For more details, please refer to :ref:`doc_proposal`. - :code:`w_combine`: (Optional, Default: :code:`False`) - Whether to perform weight combination or not. Note that weights averaged over from the last time the Wang-Landau incrementor was updated (instead of - final weights) will be used for weight combination. For more details about weight combination, please refer to :ref:`doc_w_schemes`. - - :code:`w_mean_type`: (Optional, Default: code:`simple`) + Whether to perform weight combination or not. Note that weights averaged over from the last update of the Wang-Landau incrementor (instead of the + final weights) will be used for weight combination. For more details about, please refer to :ref:`doc_w_schemes`. + - :code:`w_mean_type`: (Optional, Default: :code:`'simple'`) The type of mean to use when combining weights. Available options include :code:`simple` and :code:`weighted`. - For the later case, inverse-variance weighted means are used. + For the latter case, inverse-variance weighted means are used. For more details about, please refer to :ref:`doc_w_schemes`. - :code:`N_cutoff`: (Optional, Default: 1000) - The histogram cutoff for weight corrections. -1 means that no histogram correction will be performed. + The histogram cutoff for weight corrections. A cutoff of 1000 means that weight corrections will be applied only if + the counts of the involved states are both larger than 1000. A value of -1 means that no weight correction will be performed. + For more details, please please refer to :ref:`doc_weight_correction`. - :code:`hist_corr` (Optional, Default: :code:`False`) - Whether to perform histogram correction. + Whether to perform histogram correction. For more details, please refer to :ref:`doc_hist_correction`. - :code:`mdp_args`: (Optional, Default: :code:`None`) - MDP parameters differing across replicas provided in a dictionary. For each key in the dictionary, the value should - always be a list of length of the number of replicas. For example, :code:`{'ref_p': [1.0, 1.01, 1.02, 1.03]}` means that the + A dictionary that contains MDP parameters differing across replicas. For each key in the dictionary, the value should + always be a list of the length of the number of replicas. For example, :code:`{'ref_p': [1.0, 1.01, 1.02, 1.03]}` means that the MDP parameter :code:`ref_p` will be set as 1.0 bar, 1.01 bar, 1.02 bar, and 1.03 bar for replicas 0, 1, 2, and 3, respectively. Note that while this feature allows high flexibility in parameter specification, not all parameters are suitable to be - varied across replicas. For example, varying :code:`nsteps` across replicas for synchronous REXEE simulations does not make sense. - Additionally, this feature is a work in progress and differing :code:`ref_t` or :code:`dt` across replicas might cause issues. + varied across replicas. Users should use this parameter with caution, as there is no check for the validity of the MDP parameters. + Additionally, this feature is a work in progress and differing :code:`ref_t` or :code:`dt` across replicas would not work. - :code:`grompp_args`: (Optional: Default: :code:`None`) - Additional arguments to be appended to the GROMACS :code:`grompp` command provided in a dictionary. + A dictionary that contains additional arguments to be appended to the GROMACS :code:`grompp` command. For example, one could have :code:`{'-maxwarn', '1'}` to specify the :code:`maxwarn` argument for the :code:`grompp` command. - :code:`runtime_args`: (Optional, Default: :code:`None`) - Additional runtime arguments to be appended to the GROMACS :code:`mdrun` command provided in a dictionary. + A dictionary that contains additional runtime arguments to be appended to the GROMACS :code:`mdrun` command. For example, one could have :code:`{'-nt': 16}` to run the simulation using tMPI-enabled GROMACS with 16 threads. - Notably, if MPI-enabled GROMACS is used, one should specify :code:`-np` to better use the resources. If it is - not specified, the default will be the number of simulations and a warning will occur. 3.4. Output settings -------------------- - :code:`verbose`: (Optional, Default: :code:`True`) - Whether a verbse log is wanted. + Whether a verbose log file is desired. - :code:`n_ckpt`: (Optional, Default: 100) - The frequency for checkpointing in the number of iterations. + The number of iterations between each checkpoint. Specifically, the CLI :code:`run_REXEE` will save the replica-space trajectories + and the time series of the whole-range alchemical weights (in a weight-updating simulation) every :code:`n_ckpt` iterations. This is useful for extending a simulation. - :code:`rm_cpt`: (Optional, Default: :code:`True`) Whether the GROMACS checkpoint file (:code:`state.cpt`) from each iteration should be deleted. - Normally we don't need CPT files for REXEE simulations (even for extension) so we recommend just + Normally we don't need GROMACS CPT files for REXEE simulations (even for extension) so we recommend just deleting the CPT files (which could save a lot of space if you perform a huge number of iterations). If you wish to keep them, specify this parameter as :code:`False`. @@ -324,35 +343,32 @@ include parameters for data analysis here. - :code:`msm`: (Optional, Default: :code:`False`) Whether to build Markov state models (MSMs) for the REXEE simulation and perform relevant analysis. - :code:`free_energy`: (Optional, Default: :code:`False`) - Whether to perform free energy calculations in data analysis or not. Note that free energy calculations - could be computationally expensive depending on the relevant settings. + Whether to perform free energy calculations or not. - :code:`subsampling_avg`: (Optional, Default: :code:`False`) Whether to take the arithmetic average of the truncation fractions and the geometric average of the statistical inefficiencies over replicas when subsampling data for free energy calculations. For systems where the sampling is challenging, the truncation fraction or statistical inefficiency may vary largely - across alchemical ranges, in which case this option could be useful. + across state sets, in which case this option could be useful. - :code:`df_spacing`: (Optional, Default: 1) - The step to used in subsampling the DHDL data in free energy calculations. + The spacing (in the number of data points) to consider when subsampling the data, which is assumed to + be the same for all replicas. - :code:`df_ref`: (Optional, Default: :code:`None`) - The reference free energy profile for the whole range of states input as a list having the length of the number of states. - - :code:`df_method`: (Optional, Default: :code:`MBAR`) - The free energy estimator to use in free energy calcuulation. Available options include :code:`TI`, :code:`BAR`, and :code:`MBAR`. - - :code:`err_method`: (Optional, Default: :code:`propagate`) + The reference free energy profile for the whole range of states. The input should be a list having the length of the total number of states. + - :code:`df_method`: (Optional, Default: :code:`'MBAR'`) + The free energy estimator to use in free energy calculations. Available options include :code:`'TI'`, :code:`'BAR'`, and :code:`'MBAR'`. + - :code:`err_method`: (Optional, Default: :code:`'propagate'`) The method for estimating the uncertainty of the free energy combined across multiple replicas. - Available options include :code:`propagate` and :code:`bootstrap`. The boostrapping method is more accurate but much more + Available options include :code:`'propagate'` and :code:`'bootstrap'`. The bootstrapping method is more accurate but much more computationally expensive than simple error propagation. - :code:`n_bootstrap`: (Optional, Default: 50) - The number of bootstrap iterations to perform when estimating the uncertainties of the free energy differences between - overlapping states. - - :code:`seed`: (Optional, Default: None) + The number of bootstrap iterations to perform when estimating the uncertainties of the free energy differences. + - :code:`seed`: (Optional, Default: :code:`None`) The random seed to use in bootstrapping. 3.6. A template input YAML file ------------------------------- For convenience, here is a template of the input YAML file, with each optional parameter specified with the default and required -parameters left with a blank. Note that specifying :code:`null` is the same as leaving the parameter unspecified (i.e. :code:`None`). -Note that the default value :code:`None` for the parameter :code:`rmse_cutoff` will be converted to -infinity internally. +parameters left blank. Note that specifying :code:`null` is the same as leaving the parameter unspecified (i.e., :code:`None`). .. code-block:: yaml @@ -383,15 +399,16 @@ infinity internally. # Section 4: Output settings verbose: True n_ckpt: 100 - rm_cpt: False + rm_cpt: True # Section 5: Data analysis msm: False free_energy: False + subsampling_avg: False df_spacing: 1 df_ref: null - df_method: "MBAR" - err_method: "propagate" + df_method: 'MBAR' + err_method: 'propagate' n_bootstrap: 50 seed : null @@ -399,18 +416,18 @@ infinity internally. 4. Input MDP parameters ======================= -As mentioned above, a template MDP file should have all the parameters that will be shared +As mentioned above, a template MDP file should have all the parameters commonly shared across all replicas. It should also define the coupling parameters for the whole range of -states so that different MDP files can be customized for different replicas. For an REXEE simulation +states so that different MDP files can be customized for different replicas. For a REXEE simulation launched by the CLI :code:`run_REXEE`, any GROMACS MDP parameter that could potentially lead to issues in the REXEE simulation will raise a warning. If the number of warnings is larger than the value -specified for the flag `-m`/`--maxwarn` in the CLI :code:`run_REXEE`, the simulation will error -out. To avoid warnings arised from MDP specification, we need to take extra care for the following +specified for the flag :code:`-m`/:code:`--maxwarn` in the CLI :code:`run_REXEE`, the simulation will error +out. To avoid warnings arisen from MDP specification, users need to take extra care for the following MDP parameters: - We recommend setting :code:`lmc_seed = -1` so that a different random seed for Monte Carlo moves in the state space will be used for each iteration. -- We recommend setting :code:`gen_vel = yes` to re-generating new velocities for each iteration to avoid +- We recommend setting :code:`gen_vel = yes` to re-generate new velocities for each iteration to avoid potential issues with detailed balance. - We recommend setting :code:`gen_seed = -1` so that a different random seed for velocity generation will be used for each iteration. @@ -421,9 +438,9 @@ MDP parameters: - In REXEE, the MDP parameter :code:`nstdhdl` must be a factor of the MDP parameter :code:`nstexpanded`, or the calculation of the acceptance ratio may be wrong. - Be careful with the pull code specification if you want to apply a distance restraint between two pull groups. - Specifically, in an REXEE simulation, all iterations should use the same reference distance. Otherwise, poor sampling + Specifically, in a REXEE simulation, all iterations should use the same reference distance. Otherwise, poor sampling can be observed in a fixed-weight REXEE simulation and the equilibration time may be much longer for a weight-updating - REXEE simulation. To ensure the same reference distance across all iterations in an REXEE simulation, consider the + REXEE simulation. To ensure the same reference distance across all iterations in a REXEE simulation, consider the following scenarios: - If you would like to use the COM distance between the pull groups in the input GRO file as the reference distance @@ -434,3 +451,65 @@ MDP parameters: the MDP parameter :code:`pull_nstxout` should not be 0. - If you want to explicitly specify a reference distance (:code:`d`) to use for all iterations, simply use :code:`pull_coord1_start = no` with :code:`pull_coord1_init = d` in your input MDP template. + +5. Some rules of thumb +====================== +Here are some rules of thumb for specifying some key YAML parameters, as discussed/concluded from our paper [Hsu2024]_. + +- **Number of replicas** (:code:`n_sim`): Just like other replica exchange methods, it is generally recommended that the number of replicas be + a factor of available computational resources, such as the number of CPU cores. For example, if you have 128 CPU cores, you may + consider using 4, 8, 16, or 32 replicas. +- **Total number of states**: One advantage of the REXEE method over other replica exchange methods is that it completely + decouples the number of replicas from the number of states. Therefore, once the number of replicas is decided, the total number of states + can be arbitrary. Still, just like other generalized ensemble methods, the total number of states should be large enough + to ensure sufficient overlap between adjacent states, but not too large to make the simulation computationally expensive. +- **Number of states per replica**/**State shift**: After deciding the total number of states and the number of replicas, one can use the CLI + :code:`explore_REXEE` to list all possible REXEE configurations, from which one can decide the number of states per replica or the state shift + depending on how much overlap is desired between adjacent replicas. For example, if one has decided to use 4 replicas to sample 12 + alchemical intermediate states, one can run :code:`explore_REXEE -N 12 -r 4`, which returns the following: + + .. code-block:: + + Exploration of the REXEE parameter space + ======================================= + [ REXEE parameters of interest ] + - N: The total number of states + - r: The number of replicas + - n: The number of states for each replica + - s: The state shift between adjacent replicas + + [ Solutions ] + - Solution 1: (N, r, n, s) = (12, 4, 6, 2) + - Replica 0: [0, 1, 2, 3, 4, 5] + - Replica 1: [2, 3, 4, 5, 6, 7] + - Replica 2: [4, 5, 6, 7, 8, 9] + - Replica 3: [6, 7, 8, 9, 10, 11] + + - Solution 2: (N, r, n, s) = (12, 4, 9, 1) + - Replica 0: [0, 1, 2, 3, 4, 5, 6, 7, 8] + - Replica 1: [1, 2, 3, 4, 5, 6, 7, 8, 9] + - Replica 2: [2, 3, 4, 5, 6, 7, 8, 9, 10] + - Replica 3: [3, 4, 5, 6, 7, 8, 9, 10, 11] + + Generally, a higher overlap would lead to a higher sampling efficiency in both replica-space and state-space sampling, + but will also lead to a higher computational cost. However, the influence of different overlaps on the accuracy of + free energy calculations in a fixed-weight REXEE simulation is usually negligible. If one decides to use "Solution 1" in the above example, + then the YAML parameters :code:`n_sim` and :code:`s` should be set as 4, and 2, respectively. Note that the total number of + states should be reflected in the input MDP templated (specified through the YAML parameter :code:`mdp`) and the number of states + per replica will be automatically calculated by the CLI :code:`run_REXEE` (given the other three REXEE configurational parameters) + when customizing MDP files for different replicas. + +- **Exchange period** (:code:`nst_sim`): Generally, a higher swapping frequency (i.e., lower exchange period) would lead to a higher + sampling efficiency in both replica-space and state-space sampling, as well as higher accuracy in free energy calculations. However, + it would also lead to a higher computational cost. According to our experience, an exchange period between 500 to 2000 steps is + usually a good choice for most systems. + +- **Number of iterations**: After deciding the exchange period, the number of iterations should be decided only based on the desired effective + simulation length. For example, for a REXEE simulation running 4 replicas with an exchange period of 1000 steps (or 2 ps given a 2 fs time step), + one may consider running 12500 iterations to achieve a total simulation length of 100 ns. + +- **Correction schemes**: As discussed in our paper, for a weight-updating REXEE simulation, there has been no evidence showing any advantages of + using any implemented correction schemes, including weight combination, weight correction, and histogram correction schemes, which can be + enabled by YAML parameters :code:`w_combine`, :code:`N_cutoff`, and :code:`hist_corr`, respectively. To converge alchemical weights, we recommend + just using weight-updating EE simulations, or weight-updating REXEE simulations without any correction schemes, i.e., using default values for + these parameters. diff --git a/docs/theory.rst b/docs/theory.rst index da369a7b..3d8714b5 100644 --- a/docs/theory.rst +++ b/docs/theory.rst @@ -1,534 +1,452 @@ -.. note:: This page is still a work in progress. Please check `Issue 33`_ for the current progress. - -.. _`Issue 33`: https://github.com/wehs7661/ensemble_md/issues/33 - .. _doc_basic_idea: 1. Basic idea ============= -Replica exchange of expanded ensemble (REXEE) integrates the core principles of replica exchange (REX) -and expanded ensemble (EE) methods. Specifically, a REXEE simulation includes multiple +Replica exchange of expanded ensembles (REXEE) [Hsu2024]_ integrates the core principles of replica exchange (REX) +and expanded ensemble (EE) methods. Specifically, a REXEE simulation performs multiple replicas of EE simulations in parallel and periodically exchanges coordinates -between replicas. Each replica samples a different but overlapping range of alchemical -intermediate states to collectively sample the space bwteen the coupled (:math:`\lambda=0`) -and decoupled states (:math:`\lambda=1`). Naturally, the REXEE method decorrelates -the number of replicas from the number of states, allowing sampling a large number of intermediate -states with significantly fewer replicas than those required in the Hamiltonian replica exchange (HREX) -and other similar methods. By parallelizing replicas, the REXEE method also reduces +between replicas. Each replica samples a different but overlapping set of alchemical +intermediate states to collectively sample the space between the fully coupled (:math:`\lambda=0`) +and decoupled states (:math:`\lambda=1`). By design, the REXEE method decorrelates +the number of replicas from the number of states, enhancing the flexibility in replica configuration and +allowing a large number of intermediate states to be sampled with significantly fewer replicas than those +required in the Hamiltonian replica exchange (HREX) method. By parallelizing replicas, the REXEE method also reduces the simulation wall time compared to the EE method. More importantly, such parallelism sets the -stage for wider applications, such as relative free energy calculations for multi-topology transformations. - -Mathematically, we first consider a simulation ensemble that consists of :math:`M` non-interacting replicas -of the expanded ensemble simulations all at the same constant temperature :math:`T`. We assume -that the expanded ensembles collectively sample :math:`N` (:math:`N < M`) alchemical states with -labels :math:`m=1, 2, ..., N` and each replica sampling :math:`n_i` states starting from state -:math:`k` to state :math:`k + n_i -1`, which correspond to :math:`\lambda` vectors :math:`\lambda_k`, -:math:`\lambda_{k+1}`, ..., and :math:`\lambda_{k+n_i-1}`, repsectively. Mathematically, we can -define a non-injective but surjective function :math:`f` that maps the labels for replicas -(:math:`i`) to labels for :math:`\lambda` vectors (:math:`m`): - -.. math:: - m=m(i) \equiv f(i) - -This essentially assumes that the discrete domain :math:`\left \{k, k+1, ..., k+n_i-1 \right \}` -is always a subset of :math:`\mathcal{N} = \left \{1, 2, ..., N \right \}`. Notably, this is -different from Hamiltonian replica exchange molecular dynamics (HREMD) or temperature replica exchange -molecular dynamics (TREMD), where the mapping function should always be bijective (i.e. injective and -surjective) such that :math:`i=i(m) \equiv f(m)` and :math:`m=m(i) \equiv f^{-1}(i)`. That is, in HREMD -and TREMD, the number of alchemical states/temperatures is always the same as the number of replicas -(surjective) and there is always a one-to-one correpsondence between the two (injective). Physically, -this means that while in HREMD and TREMD, exchanging a pair of replicas is equivalent to exchanging -a pair of temperatures :math:`\lambda` vectors, we don't regard exchanging replicas as exchanging :math:`\lambda` -vectors because it is a many-to-one correpsondence instead of one-to-one correspondence. - -Now, we can write the "reduced Hamiltonian" of the :math:`i`-th expanded ensemble as - -.. math:: - H(p^{i}, q^{i}, \lambda_{m}^{i}) = k(p^{i}) + u(q^{i}, \lambda_{m}^{i}) - -where the reduced kinetic energy :math:`k(p^{i})` and reduced potential -:math:`u(q^{i}`, :math:`\lambda_{m}^{i})` can be expressed as - -.. math:: - \begin{cases} - k(p^i) = \beta K(p^{i}) \\ - u(q^i, \lambda^{i}_{m}) = \beta U(q^{i}, \lambda^{i}_{m}) - g^{i}_{m}\\ - \end{cases} - -with :math:`\beta=1/kT` being the inverse temperature and :math:`g^{i}_{m}` being the weighting factor -applied to the :math:`m`-th alchemical state (:math:`m \in \mathcal{N}`) of -the :math:`i`-th replica. As such, the probability of the state being sampled (say, state :math:`m`) -can be written as: +stage for more complicated applications, especially one-shot free energy calculations that involve multiple +topologies, such as serial mutations or scaffold-hopping transformations. + +In the following sections, we will briefly cover the theory behind the REXEE method, from its configuration, state +transitions, proposal schemes, weight combination schemes, to data analysis. For more details, please refer to our +paper [Hsu2024]_. + +.. figure:: _static/REXEE_illustration.png + :name: Fig. 1 + :width: 800 + :align: center + :figclass: align-center + + **Figure 1.** Schematic representation of the REXEE method, with the four configurational parameters annotated. In a REXEE simulation, the coordinates of replicas + of EE simulations are periodically exchanged. :math:`{\bf A}_1`, :math:`{\bf A}_2`, :math:`{\bf A}_3`, and :math:`{\bf A}_4` + denote the sets of states different replicas are sampling. + +2. REXEE configuration +====================== +Here we consider a REXEE simulation composed of :math:`R` parallel replicas of expanded ensembles, each of which is +labeled as :math:`i=1, 2, ..., R`. These :math:`R` replicas are restricted to sampling :math:`R` different yet overlapping +sets of states (termed **state sets**) labeled by :math:`m` as :math:`{\bf A}_1`, :math:`{\bf A}_2`, ..., :math:`{\bf A}_R`, +which collectively sample :math:`N` alchemical intermediate states in total, with :math:`N > R`. Additionally, we define :math:`s_i \in \{1, 2, ..., N\}` +as the index of the state currently sampled by the :math:`i`-th replica. For a replica :math:`i` sampling state set :math:`{\bf A}_m`, +:math:`s_i` is additionally constrained such that :math:`s_i \in {\bf A}_m`. Importantly, the fact that :math:`s_i` takes values +in :math:`\{1, 2, ..., N\}` and :math:`N>R` implies a many-to-one relationship between the replica index :math:`i` and the state index +:math:`s_i`, as a certain state may be sampled by multiple replicas. This is in contrast to the one-to-one relationship between the replica +index :math:`i` and the state set index :math:`m`, which ensures that each replica is associated with one unique state set. + +Importantly, a valid REXEE configuration only requires overlapping state sets and is not restricted to one-dimensional grids, +the same number of states for all replicas, nor sequential state indices within the same state sets. For example, `Fig. 2`_ shows cases where +intermediate states are characterized by more than one thermodynamic variable (panels A and B), where different state sets +have different numbers of states (panels C), and where the state indices are not consecutive within the same state sets (panels A and C). +Still, the most common case is where the intermediate states are defined in a one-dimensional space, with consecutive state indices within +the same state set (e.g., the case in `Fig. 1`_). In a REXEE simulation with such a configuration, a state shift :math:`\phi` between adjacent +state sets can be defined to indicate to what extent the set of states has shifted along the alchemical coordinate. Depending on whether all replicas +have the same number of states and whether or not the state shift is consistent between all adjacent state sets, a REXEE simulation can be either +homogeneous or heterogeneous. Currently, :code:`ensemble_md` has only implemented the homogeneous REXEE method with one-dimensional alchemical intermediate +states defined sequentially in each state set. + +.. figure:: _static/REXEE_more_configs.png + :name: Fig. 2 + :width: 800 + :align: center + :figclass: align-center + + **Figure 2.** Different possible replica configurations of a REXEE simulation, with each state represented as a grid labeled by the number in its center + and characterized by different Hamiltonians and/or temperatures. Different state sets are represented as dashed lines in different colors. + Note that the temperature :math:`T` and Hamiltonian :math:`H` can be replaced by other physical variables of interest, such as pressure or chemical potential. + +As shown in `Fig. 1`_, a homogeneous REXEE simulation that samples sequential one-dimensional states can be configured by the following four parameters: + + - :math:`N`: The total number of intermediate states + - :math:`R`: The total number of replicas + - :math:`n_s`: The number of states per replica + - :math:`\phi`: The state shift between adjacent state sets + +These four configurational parameters are related via the following relationship: + +.. math:: N = n_s + (R-1)\phi + :label: eq_1 + +For example, the configuration of the REXEE simulation shown in `Fig. 1`_ can be expressed as :math:`(N, R, n_s, \phi) = (9, 4, 6, 1)`. Importantly, the total +number of states :math:`N` does not have to be equal to the number of replicas :math:`R` in the REXEE method. In fact, it is shown in the Supporting Information of +our paper [Hsu2024]_ that for a REXEE simulation simulation sampling any number of replicas, there exists at least one valid REXEE +configuration, allowing much higher flexibility in replica configuration compared to traditional replica exchange methods -- once the number of replicas +is decided, typically as a factor of the number of available cores, the total number of states can be arbitrary. In our Supporting Information, +we also show that solving Equation :eq:`eq_1` with a few additional constraints allows efficient enumeration of all possible REXEE configurations. In :code:`ensemble_md`, +this enumeration is implemented in the command line interface (CLI) command :code:`explore_REXEE`, as elaborated in :ref:`doc_explore_REXEE`. + +3. State transitions in REXEE simulations +========================================= +In a REXEE simulation, state transitions occur at both the intra-replica and inter-replica levels. Within each replica of expanded ensemble simulation, +transitions between alchemical states within the state set and the detailed balance conditions are governed by the selected algorithm in the expanded ensemble simulation +(i.e., the value of the GROMACS MDP parameter :code:`lmc-stats-move` in our implementation). Still, to ensure that probability influx and outflux are equal for each set of states, +the detailed balance condition at the intra-replica level must be satisfied. + +Mathematically, we consider replicas :math:`i` and :math:`j` that sample the state sets :math:`{\bf A}_m` and :math:`{\bf A}_n`, respectively. To swap replicas :math:`i` +and :math:`j`, the state sampled by replica :math:`i` at the moment, denoted as :math:`s_i \in {\bf A}_m`, must fall within the state set :math:`{\bf A}_n` that is to be swapped, +and vice versa. In this case, we call that these replicas :math:`i` and :math:`j` are **swappable**, and we express the exchange of coordinates :math:`x_i` and :math:`x_j` between these +two replicas as + +.. math:: :label: eq_2 + + X=\left(..., x^i_{m}, ..., x^j_{n}, ...\right) \rightarrow X' = \left(..., x^j_{m}, ..., x^i_{n}, ...\right) -.. math:: - p = \frac{\exp(-\beta H(p^{i}, q^{i}))}{\int \exp(-\beta H(p^{i}, q^{i})) dq^{i}}=\frac{\exp(-\beta K(p^{i}) -\beta U(q^i, \lambda^{i}_{m}) + g^{i}_{m})}{\int \exp(-\beta K(p^{i}) -\beta U(q^i, \lambda^{i}_{m}) + g^{i}_{m}) dq^{i}} +with :math:`x^i_m \equiv (x_i, {\bf A}_m)` meaning that the :math:`i`-th replica samples the :math:`m`-th state set with the coordinates :math:`x_i`. Mathematically, the list of swappable pairs +:math:`\mathcal{S}` can be defined as the set of replica pairs as follows: -Let :math:`X=(x^{1}_{m(1)}, x^{2}_{m(2)}, ..., x^{M}_{m(M)})` stand for a "state" in the generalized ensemble -sampled by EXEE, where the superscript and subscript of :math:`x^{i}_{m(i)}` are the labels for the -replica and :math:`\lambda` states, respectively. The state :math:`X` is specified by the :math:`M` sets of -coordinates :math:`q^{i}` and momenta :math:`q^{i}` in replica :math:`i` with a :math:`\lambda` vector -:math:`\lambda^{i}_{m}` such that :math:`x^{i}_{m}\equiv(q^{i}, p^{i})_m`. Given that the replicas of -expanded ensemble are non-interacting, the weight factor for a state :math:`X` in this generalized ensemble -is given by the product of Boltzmann factors for each replica: +.. math:: :label: eq_3 -.. math:: - W(X) = \exp \left [\sum^{M}_{i=1} (-\beta K(p^{i}) -\beta U(q^i, \lambda^{i}_{m}) + g^{i}_{m})\right ] + \mathcal{S} = \left\{(i, j) \mid s_i \in {\bf A}_n, s_j \in {\bf A}_m, i \neq j\right\} -Now, we consider exchanging replicas of expanded ensemble :math:`i` and :math:`j`: +As discussed in the Supporting Information of the paper [Hsu2024]_, the most straightforward way to derive the acceptance ratio that satisfies the intra-replica detailed balance condition +is to assume symmetric proposal probabilities, which can be easily achieved by the design of the used proposal scheme. (See :ref:`doc_proposal` for more details.) +Under this assumption, the acceptance ratio of swapping the coordinates :math:`x_i` and :math:`x_j` between replicas :math:`i` and :math:`j` can be expressed as -.. math:: - X = (..., x_{m}^{i}, ..., x_{n}^{j}, ...) \rightarrow X' = (..., x_{f'(i)}^{i}, ..., x_{f'(j)}^{j}, ...) = (..., x_{n}^{i}, ..., x_{m}^{j}, ...) +.. math:: :label: eq_4 -Notably, such an exchange introduces a new non-injective but surjective function :math:`f'` mapping the label -:math:`i` to the label :math:`m`. (Note that since both functions :math:`f` or :math:`f'` are not bijective, -we don't have their inverse to map the label :math:`m` back to the label :math:`i`.) That is, we have: + P_{\text{acc}} = + \begin{cases} + \begin{aligned} + &1 &, \text{if } \Delta \leq 0 \\ + \exp(&-\Delta) &, \text{if } \Delta >0 + \end{aligned} + \end{cases} -.. math:: - m = f(i) \rightarrow n=f'(i) +where -To ensure the convergence of the exchange process towards an equilibrium distribution, we impose the detailed -balance condition on the transition probability :math:`w(X \rightarrow X')`: +.. math:: :label: eq_5 -.. math:: - W(X)w(X \rightarrow X') = W(X')w(X' \rightarrow X) + \Delta = \left(u_{s_i}(x_j) + u_{s_j}(x_i) \right)-\left(u_{s_i}(x_i)+u_{s_j}(x_j)\right) -Here, we introduce a shorthand expression for the potential energy such that terms like :math:`U(q^i, \lambda^{i}_{m})` -can be rewriteen as :math:`U^i_m`. With this, we have +In Equation :eq:`eq_5`, :math:`u_{s_i}(x_j)` and :math:`u_{s_j}(x_i)` are the reduced potentials of the states :math:`s_i` and :math:`s_j` evaluated at the coordinates :math:`x_j` and :math:`x_i`, respectively. -.. math:: - \begin{aligned} - \frac{w(X \rightarrow X')}{w(X' \rightarrow X)} & = \frac{W(X')}{W(X)} \\ - & = \frac{\exp(-\beta K(p^{i}) -\beta U(q^{i}, \lambda^{i}_{n}) + g^{i}_{n} -\beta K(p^{j}) -\beta U(q^{j}, \lambda^{j}_{m}) + g^{j}_{m})}{\exp(-\beta K(p^{i}) -\beta U(q^{i}, \lambda^{i}_{m}) + g^{i}_{m} -\beta K(p^{j}) -\beta U(q^{j}, \lambda^{j}_{n}) + g^{j}_{n})} \\ - & = \exp(-\beta[(U^i_n + U^j_m) - (U^i_m+U^j_n)] + [(g^i_n+g^j_m)-(g^i_m+g^j_n)]) \\ - \end{aligned} - \label{trans} +.. _doc_proposal: -where +4. Proposal schemes +=================== +In this section, we discuss proposal schemes available in the current implementation of the package :code:`ensemble_md`, +each of which has a symmetric proposal probability. These proposal schemes can be specified via the option :code:`proposal` in the input YAML file (e.g., :code:`params.yaml`) +for running a REXEE simulation. For more details about the input YAML file, please refer to :ref:`doc_parameters`. -.. math:: - \Delta = \beta[(U^i_n + U^j_m) - (U^i_m+U^j_n)] - [(g^i_n+g^j_m)-(g^i_m+g^j_n)] +4.1. Single exchange proposal scheme +------------------------------------ +The single exchange proposal scheme randomly draws a pair of replicas from the list of swappable pairs :math:`\mathcal{S}` defined in :eq:`eq_3`, with each pair in the list +having an equal probability to be drawn. In this case, the proposal probability can be expressed as follows: -Notably, in the equation above, all kinetic energy terms and the Hamiltonians of non-exchanging -replicas are canceled out. Now, using the usual Metropolis criterion, the transition probability -:math:`w(X \rightarrow X')` can be expressed as +.. math:: :label: eq_6 -.. math:: - w(X \rightarrow X') = - \begin{cases} + \alpha\left(X'|X\right)= \alpha\left(x^j_{m}, x^i_{n} | x^i_{m}, x^j_{n}\right)= + \begin{cases} \begin{aligned} - &1 &, \;\text{if} \;\Delta \leq 0 \\ - \exp(&-\Delta) &, \;\text{if} \;\Delta >0 - \end{aligned} + &1/|\mathcal{S}|& \text{, if } (i, j) \in \mathcal{S} \\ + & \quad 0 &\text{, if } (i, j) \notin \mathcal{S} + \end{aligned} \end{cases} -Notably, if the systems of the replicas to be exchanged are sampling the same alchemical -state (namely, :math:`m=n`) right before the exchange occurs, :math:`\Delta` will reduce to -0, meaning the the exchange will always be accepted. +In our implementation in :code:`ensemble_md`, this method can be used by setting :code:`proposal: 'single'` in the input YAML file. -2. Replica swapping -=================== +4.2. Neighbor exchange proposal scheme +-------------------------------------- +In the neighbor exchange proposal scheme implemented in :code:`ensemble_md` (which is enabled by setting :code:`proposal: 'neighbor'` in the input YAML file), +we add a constraint to :math:`mathcal{S}` defined in Equation :eq:`eq_3` such that the swappable pairs consist exclusively of neighboring replicas, +with each pair having an equal probability to be drawn. Formally, the proposal probability in this case can be expressed as +follows: -.. _doc_proposal: +.. math:: :label: eq_7 -2.1. Proposal schemes ---------------------- -In the current implementation, the following proposal schemes are available and can be -specified via the option :code:`proposal` in the input YAML file (e.g. :code`params.yaml`). - -2.1.1. Single swap -~~~~~~~~~~~~~~~~~~ -If the option :code:`proposal` is specified as :code:`single` in the input YAML file, the method -of single swap will be used, in which only a single swap will be randomly drawn from the list of swappable -pairs within an exchange interval. Here, a pair of replicas can be swapped only if the states to be swapped -are present in both of the alchemical ranges of the two replicas. After the pair is drawn, the acceptance -ratio will be calculated given the specified acceptance scheme to decide whether the swap should be accepted. - -2.1.2. Neighboring swap -~~~~~~~~~~~~~~~~~~~~~~~ -If the option :code:`proposal` is specified as :code:`neighboring` in the input YAML file, the method -of neighboring swap will be used. This method is exactly the same as the single swap method execpt that it -additionally requires the swappable pairs to be neighboring replicas. - -2.1.3. Exhaustive swaps -~~~~~~~~~~~~~~~~~~~~~~~ -If the option :code:`proposal` is specified as :code:`exhaustive` in the input YAML file, the method -of exhaustive swaps will be used. In an exchange interval, this method requires repeatedly drawing a pair -from the list of swappable pairs and remove pair(s) involving previously drawn replicas until the list is empty. -For each proposed swap, we calculate the acceptance ratio to decide whether the swap should be accepted or not. -In greater detail, this scheme can be decomposed into the following steps: - - - **Step 1**: Identify the list of swappable pairs. - - **Step 2**: Randomly draw a pair from the list of swappable pairs. - - **Step 3**: Calculate the acceptance ratio for the drawn pair to decide whether the swap should be accepted. - Then, perform or reject the swap. - - **Step 4**: Update the list of swappable pairs by removing pair(s) that involve any replica in the drawn pair in Step 2. - - **Step 5**: Repeat Steps 2 to 4 until the list of swappable pairs is empty. - -Note that - - - In this method, no replicas should be involved in more than one proposed swap. - - Given :math:`N` alchemical intermediate states in total, one can at most perform :math:`\lfloor N \rfloor` swaps with this method. - - While this method can lead to multiple attempted swaps, these swaps are entirely indepdent of each other, which is - different from the method of multiple swaps introduced below. - - Importantly, whether the swap in Step 3 is accepted or rejected does not influence the update of the list in Step 4 at all. - This is different from the method of multiple swaps introduced in the next section, where the updated list of swappable pairs depends on - the acceptance/rejection of the current attempted swap. - - Since all swaps are independent, instead of calculating and acceptance ratio and performing swaps separately (as done in Step 3 in the procedure above), one - can choose to calculates all acceptance ratios for all drawn pairs and perform all swaps at the same time at the end. - We chose to implement the former in :obj:`.get_swapping_pattern` since this is more consistent with the protocol of the other proposal schemes - , hence easier to code. - -.. _doc_multiple_swaps: - -2.1.4. Multiple swaps -~~~~~~~~~~~~~~~~~~~~~ -If the option :code:`proposal` is specified as :code:`multiple` in the input YAML file, the method -of multiple swaps will be used, where the number of swaps can be specified by the :code:`n_ex` parameter in -the input YAML file. If :code:`n_ex` is not specified, :math:`N^3` swaps will be attmpted in an exchange interval, -where :math:`N` is the total number of alchemical intermediate states. For each attempted swap in this method, -one pair will be drawn from the list of swappable pairs (with replacement). Between attempted swaps, the acceptance -ratio is calculated to decide whether the swap should be accepted. Then, if the swap is accepted, the list of -swappable pairs will be updated by re-identifying swappable pairs based on the updated permutation. (That is, the -next attempted swap is dependent on whether the current swap is accepted.) If the swap is rejected, the execution will -end and there won't be a new pair drawn. In greater detail, this scheme can be decomposed into the following steps: - - - **Step 1**: Identify the list of swappable pairs. - - **Step 2**: Randomly draw a pair from the list of swappable pairs. - - **Step 3**: Calculate the acceptance ratio to decide whether the swap drawn in Step 2 should be accepted. - - **Step 4**: If the attempted swap is accepted, update the permutation of the state ranges and update the list of swappable - pairs accordingly. Otherwise, the list of swappable pairs stay the same. - - **Step 5**: Perform Steps 2 and 3 for the desired number of times (specified by :code:`n_ex`). - -.. note:: Except for the method of multiple swaps, all methods introduced above obey the detailed balance condition. - We set the default of the paramter :code:`proposal` as :code:`exhaustive` as it allows higher sampling efficiency. - -.. _doc_acceptance: - -2.2. Acceptance schemes ------------------------ -In the current implementation, 3 acceptance schemes are available to be specified -in the input YAML file (e.g. :code:`params.yaml`) via the option :code:`acceptance`, including :code:`same-state`/:code:`same_state`, -and :code:`metropolis`. In our implementation, -relevant methods include :obj:`.propose_swaps`, :obj:`.calc_prob_acc`, and :obj:`.accept_or_reject`. -Below we elaborate the details of each of the swapping schemes. - -.. _doc_same_state: - -2.2.1. Same-state swapping -~~~~~~~~~~~~~~~~~~~~~~~~~~ -The simplest scheme for swapping replicas is the same-state swapping scheme, which only swaps -replicas only if they both happen to same the same alchemical states right before the swap. That -is, the acceptance ratio is always either :math:`1` (same state) or :math:`0` (different states). -Notably, this swapping scheme does not obey the detailed balance condition. - -2.2.2. Metropolis swapping -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Metropolis swapping uses the Metropolis criterion to swap replicas, i.e. - -.. math:: - w(X \rightarrow X') = - \begin{cases} + \alpha\left(X'|X\right)= \alpha\left(x^j_{m}, x^i_{n} | x^i_{m}, x^j_{n}\right)= + \begin{cases} \begin{aligned} - &1 &, \;\text{if} \;\Delta \leq 0 \\ - \exp(&-\Delta) &, \;\text{if} \;\Delta >0 - \end{aligned} + &1/|\mathcal{S}_{\text{neighbor}}|& \text{, if } (i, j) \in \mathcal{S_{\text{neighbor}}} \\ + & \quad 0 &\text{, if } (i, j) \notin \mathcal{S_{\text{neighbor}}} + \end{aligned} \end{cases} where -.. math:: - \Delta = \beta[(U^i_n + U^j_m) - (U^i_m+U^j_n)] - [(g^i_n+g^j_m)-(g^i_m+g^j_n)] - -In theory, this swapping scheme should obey the detailed balance condition, as derived -in :ref:`doc_basic_idea`. - -2.3. Calculation of Δ in Metropolis-based acceptance schemes ------------------------------------------------------------- -The calculation of :math:`\Delta` is important because the acceptance ratio :math:`w(X\rightarrow X')=\min(1, \exp(-\Delta))` is -directly related to :math:`\Delta`. To better understand how :math:`\Delta` is calculated in the Metropolise-based methods, -we need to first know what's available in the DHDL file of a GROMACS expanded ensemble simulation. As an example, below -we tabulate the data of a DHDL file, with the time column having units of ps and all energy quantities having -units of kJ/mol. - -.. list-table:: - :widths: 10 10 10 10 10 10 10 10 - :header-rows: 1 - - * - Time - - State - - Total energy - - :math:`dH/d\lambda` at :math:`\lambda_{\text{coul}}=0` - - :math:`dH/d\lambda` at :math:`\lambda_{\text{vdw}}=0` - - :math:`\Delta H` w.r.t :math:`(0, 0)` - - :math:`\Delta H` w.r.t :math:`(0, 0.2)` - - ... - * - 0.0 - - 0 - - -16328.070 - - 92.044243 - - -24.358231 - - 6.1035156e-05 - - 18.408772 - - ... - * - 2.0 - - 1 - - -16259.254 - - 69.588318 - - -5.8508954 - - -13.917714 - - -1.5258789e-05 - - ... - * - 4.0 - - 7 - - -16060.098 - - -171.03197 - - -55.529320 - - 86505.967 - - 86471.757 - - ... - * - 6.0 - - 6 - - -16164.012 - - -14.053808 - - 30.875639 - - 65.827232 - - 63.016821 - - ... - * - ... - - ... - - ... - - ... - - ... - - ... - - ... - - ... - -Notably, in the DHDL file, the total energy (i.e. Hamiltonian, denoted as :math:`H` in the table above) -could be the sum of kinetic energy and potential energy or just the total potential energy, depending how the parameter -:code:`dhdl-print-energy` is specified in the MDP file. However, as we will see later, we only care about -:math:`\Delta H`, which should be equal to :math:`\Delta U` regardless of how :code:`dhdl-print-energy` is -specified. This is because at each time frame, the kinetic energy of the system being at differnt :math:`\lambda` -values should be the same and cancelled out. (The kinetic energy is :math:`\lambda`-dependent.) With this, below -we describe more details about the calculation of the difference in the potential energies and the difference in the alchemical weights. - -2.3.1. The calculation of :math:`\beta[(U^i_n + U^j_m) - (U^i_m+U^j_n)]` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Note that for each time frame shown in the table above, there is always one :math:`\Delta H` being 0, which happens when -:math:`\Delta H` is calculated with respect to the state being sampled at that time frame. For example, if the vector of coupling -parameters of state 6 is :math:`(0.5, 0)`, then at :math:`t=6` ps, when the replica is sampling state 6, :math:`\Delta H` w.r.t. :math:`(0.5, 0)` should be 0. This -allows us to get the individual values of :math:`U^{i}_{n}`, :math:`U^{j}_{m}`, :math:`U^{i}_{m}`, and :math:`U^{j}_{n}` by assuming -the state being visited as the reference (i.e. :math:`U=0`). With this, we can calulcate :math:`\beta[(U^i_n + U^j_m) - (U^i_m+U^j_n)]` -with ease. - -As an example, here we assume that we have four replicas labelled as 0, 1, 2, and 3 sampling configurations A, B, C and D and they -ended up at states a, b, c, and d. Schematically, we have -:: - - replica 0 1 2 3 - state a b c d - config A B C D - -When swapping the configurations of replicas 0 and 1, we need to calculate the term :math:`\beta[(U^A_b + U^B_a) - (U^A_a+U^B_b)]`, or equivalently -:math:`\beta[(U^A_b - U^A_a) + (U^B_a-U^B_b)]`. Since now replica 0 is at state a at the end of the simulation, :math:`U^A_b - U^A_a` is immediately -available in the DHDL file in replica 0, which is the final value of :math:`\Delta H` w.r.t :math:`(x, y)`, where :math:`(x, y)` -is the vector of the coupling parameter of state b. - -Now let's say the table above comes from the DHDL file of replica 0. If at :math:`t=6` ps, we are swapping replicas A and B and -:math:`a=6`, :math:`b=0` (i.e. at :math:`t=6` ps, replicas A and B are sampling states 6 and 0, respectively), then :math:`U^A_b - U^A_a=U^A_0 - U^A_6=65.827232`. -Similarly, :math:`U^B_a - U^B_b` can be looked up in the DHDL file of replica 1, so :math:`\Delta` can be calculated. -(Note that we need to convert the units of :math:`\Delta U` from kJ/mol to kT, which is more convenient for the calculation of the acceptance ratio. - -In the case that mutiple swaps are desired (i.e. :code:`proposal` is :code:`multiple`), say :code:`n_ex` is 2, if the first swap between replicas 0 and 1 shown above is accepted and now -we are swapping replicas 1 and 2 in the second swap, then we must be aware that the configurations now corresponding to replicas 0 and 1 is not A and B, -but B and A, repsecitvely: -:: - - replica 0 1 2 3 - state a b c d - config B A C D - -Therefore, when swapping replicas 1 and 2, instead of calculating :math:`\beta[(U^B_c - U^B_b) + (U^C_b-U^C_c)]`, we calculate :math:`\beta[(U^A_c - U^A_b) + (U^C_b-U^C_c)]`. -That is, when swapping replicas 1 and 2 in this case, instead of getting values from the DHDL files of replicas 1 and 2, we actually need to get values from -the DHDL files of reaplicas 0 and 2. In this case, :math:`U^A_c - U^A_b` is not immediately available in the table because configuration A -was at state :math:`a=6`, so the whole vector of :math:`\Delta H` is calculated against state 6. However, as mentioned above, we have -:math:`U^A_6=0`, so we can still calculate :math:`U^A_c - U^A_b` by just taking the difference between :math:`\Delta H` w.r.t. :math:`(x_c, y_c)` -and :math:`\Delta H` w.r.t. :math:`(x_b, y_b)`, where :math:`(x_c, y_c)` and :math:`(x_b, y_b)` are the vectors of coupling parameters of states c and b. -While all this process can sound a little confusing, it has been already taken care of by the function :obj:`.calc_prob_acc`. - -2.3.2. The calculation of :math:`[(g^i_n + g^j_m) - (g^i_m+g^j_n)]` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In the log file of a GROMACS expanded ensemble simulation, the alchemical weights have units of kT, which is why we don't have -the inverse temperature :math:`\beta` multiplied with the weights. Unlike the potential energy terms, in the log file we can find -the individual values of :math:`g^{i}_{n}`, :math:`g^{j}_{m}`, :math:`g^{i}_{m}` and :math:`g^{j}_{n}`. Now say that the log file of replica C -reads below at :math:`t=6` ps: - -:: - - Step Time - 500 6.00000 - - Writing checkpoint, step 500 at Mon Jun 13 02:59:57 2022 - - - MC-lambda information - Wang-Landau incrementor is: 0.32 - N CoulL VdwL Count G(in kT) dG(in kT) - 1 0.000 0.000 18 0.00000 2.94000 - 2 0.250 0.000 11 2.94000 1.26000 - 3 0.500 0.000 13 4.20000 2.10000 << - 4 0.750 0.000 2 6.30000 0.84000 - 5 1.000 0.000 2 7.14000 0.04000 - 6 1.000 0.250 4 7.18000 0.00000 - -Then apparently we have :math:`g^C_0=0` and :math:`g^C_6=7.18` kT, respectively. And the values of -:math:`g^A_0` and :math:`g^A_6` can be found in the log file of replica A, which enables use to -calculate :math:`(g^i_n+g^j_m)-(g^i_m+g^j_n)`. Notably, although it could be interesting to know -the bias difference between different configurations sampling the same state in different alchemical -ranges (i.e. :math:`g^i_n-g^j_n` and :math:`g^j_m-g^i_m`), it does not make sense to calculate such -values from the log file because alchemical weights from in the log files corresponding to simulations -sampling different alchemical ranges would have different references. Therefore, only values such as -:math:`g^i_n-g^i_m` and :math:`g^j_m-g^j_n` make sense, even if they are as interesting as :math:`g^i_n-g^j_n` and :math:`g^j_m-g^i_m`. - -2.4. How is swapping performed? -------------------------------- -As implied in :ref:`doc_basic_idea`, in an REXEE simulation, we could either choose to swap configurations -(via swapping GRO files) or replicas (via swapping MDP files). In this package, we chose the former when -implementing the REXEE algorithm. Specifically, in the CLI :code:`run_REXEE`, the function :obj:`.get_swapping_pattern` -is called once for each iteration and returns a list :code:`swap_pattern` that informs :code:`run_REXEE` how -the GRO files should be swapped. (To better understand the list :code:`swap_pattern`, see the docstring of -the function :obj:`.get_swapping_pattern`.) Internally, the function :obj:`.get_swapping_pattern` not only swaps -the list :code:`swap_pattern` when an attempted move is accepted, but also swaps elements in lists that contains -state shifts, weights, paths to the DHDL files, state ranges, and the attribute :code:`configs`, but not the elements -in the list of states. Check the source code of :obj :`.get_swapping_pattern` if you want to understand the details. +.. math:: :label: eq_8 -.. _doc_w_schemes: + \mathcal{S}_{\text{neighbor}} = \{(i, j)|s_i \in A_n \text{ and } s_j \in A_m \text{ and } |i-j|=1\} + +4.3. Exhaustive exchange proposal scheme +---------------------------------------- +As opposed to the single exchange or neighbor exchange proposal schemes, one can propose +multiple swaps within an exchange interval to further enhance the mixing of replicas. In :code:`ensemble_md`, +one available method is the exhaustive exchange proposal scheme, which can be enabled by setting :code:`proposal: 'exhaustive'` in the input YAML file. +As detailed in Algorithm 1 below, the exhaustive exchange proposal scheme operates similarly to the single exchange proposal scheme, but +exhaustively traverses the list of swappable pairs while updating the list by eliminating any pair involving replicas that +appeared in the previously proposed pair, ensuring symmetric proposal probabilities. Intuitively, the exhaustive exchange proposal +scheme leads to more efficient state-space and replica-space sampling than the other two +proposal schemes, as it potentially allows for more exchanges to occur within an exchange interval. -3. Weight combination +.. figure:: _static/algorithm.png + :width: 800 + :align: center + :figclass: align-center + +| + +.. _doc_correction: + +5. Correction schemes ===================== +For weight-updating REXEE simulations, we experimented with a few correction schemes that aim to improve the convergence of the alchemical weights. +These correction schemes include weight combination and histogram correction schemes, which can be enabled by setting +:code:`w_combine: True` and :code:`hist_corr: True` in the input YAML file, respectively. While there has not been evidence showing that these correction schemes could improve the +weight convergence in REXEE simulations (as discussed in our paper [Hsu2024]_), we still provide these options for users to experiment with. +In the following sections, we elaborate on the details of these correction schemes. + + +.. _doc_w_schemes: + +5.1. Weight combination +----------------------- +In contrast to other generalized ensemble methods such as EE or HREX methods, the REXEE method possesses overlapping states, i.e., +the states that fall within the intersection of at least two state sets and are therefore accessible by multiple replicas. To leverage the statistics of +these overlapping states, we could combine the alchemical weights of these states across replicas before +initializing the next iteration. The hypothesis is that such on-the-fly modifications to the weights could potentially +accelerate the convergence of the alchemical weights and provide a better starting point for the subsequent production phase. +Noting that there are various ways to combine the weights of overlapping states across replicas, the simple approach we have implemented in :code:`ensemble_md` +is to simply calculate the average of the weight differences accessible by multiple replicas, and reassign weights based on these averages to the overlapping states. +This average can be either a simple average or an inverse-variance weighted average, which is less sensitive to the presence of outliers in the weight differences. +Mathematically, we write the weight difference between the states :math:`s` and :math:`s+1` in replica :math:`i$` as :math:`\Delta g_{(s, s+1)}^i=g^i_{s+1}-g^i_{s}`, +and the set of replicas that can access both :math:`s` and :math:`s+1` as :math:`\mathcal{Q}_{(s, s+1)}`. Then, for the case where the inverse-variance weighting is used, +we have the averaged weight difference between :math:`s` and :math:`s+1` as: + +.. math:: :label: eq_9 + + \overline{\Delta g_{(s, s+1)}} = \dfrac{\sum_{k \in \mathcal{Q}_{(s, s+1)}}\left( \Delta g^{k}_{(s, s+1)}\middle/\left(\sigma^k_{(s, s+1)}\right)^2\right)}{\sum_{k \in \mathcal{Q}_{(s, s+1)}} \left. 1\middle/\left(\sigma^k_{(s, s+1)}\right)^2\right.}\label{w_combine} + +with its propagated error being + +.. math:: :label: eq_10 + + \delta_{(s, s+1)} = \sqrt{\left(\sum_{k\in\mathcal{Q}_{(s, s+1)}}\left(\sigma^k_{(s, s+1)}\right)^{-2}\right)^{-1}}\label{w_combine_err} + +where :math:`\sigma^k_{(s, s+1)}` is the standard deviation calculated from the time series of :math:`\Delta g^{k}_{(s, s+1)}` since the last update of the Wang-Landau incrementor +in the EE simulation sampling the :math:`k`-th state set. For a more detailed demonstration of weight combinations, please refer to the example below. + +.. collapse:: An example of weight combination + + Here we consider the following sets of weights + as an example, with :code:`X` denoting a state not present in the state set: + + :: + + State 0 1 2 3 4 5 + Rep A 0.0 2.1 4.0 3.7 X X + Rep B X 0.0 1.7 1.2 2.6 X + Rep C X X 0.0 -0.4 0.9 1.9 + + As shown above, the three replicas sample different but overlapping states. Now, our goal + is to + + * For state 1, combine the weights arcoss replicas 1 and 2. + * For states 2 and 3, combine the weights across all three replicas. + * For state 4, combine the weights across replicas 1 and 2. + + That is, we combine weights arcoss all replicas that sample the state of interest regardless of + which replicas are swapping. The outcome of the whole process should be three vectors of modified + alchemical weights, one for each replica, that should be specified in the MDP files for the next iteration. + Below we elaborate on the details of each step carried out by our method implemented in :code:`ensemble_md`. + + First, we calculate the weight differences as shown below, which can be regarded as rough estimates + of free energy differences between the adjacent states. + + :: + + States (0, 1) (1, 2) (2, 3) (3, 4) (4, 5) + Rep 1 2.1 1.9 -0.3 X X + Rep 2 X 1.7 -0.5 1.4 X + Rep 3 X X -0.4 1.3 1.0 + + Note that to calculate the difference between, say, states 1 and 2, from a certain replica, + both these states must be present in the alchemical range of the replica. Otherwise, a free + energy difference can't not be calculated and is denoted with :code:`X`. Then, for the weight differences that are available in more than 1 replica, we take the simple + average of the weight differences. That is, we have: + + :: + + States (0, 1) (1, 2) (2, 3) (3, 4) (4, 5) + Final 2.1 1.8 -0.4 1.35 1.0 + + Assigning the first state as the reference that has a weight of 0, we have the following profile: + + :: + + Final g 0.0 2.1 3.9 3.5 4.85 5.85 + + Notably, In our implementation in :code:`ensemble_md` (or more specifically, the function :obj:`.combine_weights` in the class :obj:`.ReplicaExchangeEE` in :obj:`.replica_exchange_EE`), + `inverse-variance weighted averages`_ can be used instead of simple averages used above, in which case uncertainties of the input weights (e.g., calculated as the standard + deviation of the weights since the last update of the Wang-Landau incrementor) are required. + + .. _`inverse-variance weighted averages`: https://en.wikipedia.org/wiki/Inverse-variance_weighting + + Finally, we need to determine the vector of alchemical weights for each replica. To do this, + we just shift the weight of the first state of each replica back to 0. As a result, we have + the following vectors: + + :: + + State 0 1 2 3 4 5 + Rep 1 0.0 2.1 3.9 3.5 X X + Rep 2 X 0.0 1.8 1.4 2.75 X + Rep 3 X X 0.0 -0.4 0.95 1.95 + + As a reference, here are the original weights: + + :: + + State 0 1 2 3 4 5 + Rep 1 0.0 2.1 4.0 3.7 X X + Rep 2 X 0.0 1.7 1.2 2.6 X + Rep 3 X X 0.0 -0.4 0.9 1.9 + + Notably, taking the simple average of weight differences/free energy differences is equivalent to + taking the geometric average of the probability ratios. + +| + +.. _doc_weight_correction: + +5.2. Weight correction +---------------------- +In the weight combination method shown above, we frequently exploited the relationship :math:`g(\lambda)=f(\lambda)=-\ln p(\lambda)`. +However, this relationship is true only when the histogram of state visitation is exactly flat, which rarely happens in reality. +To correct this deviation, we can convert the difference in the histogram counts into the difference in free energies. This is based on the +fact that the ratio of histogram counts is equal to the ratio of probabilities, whose natural logarithm is equal +to the free energy difference of the states of interest. Specifically, we have: + +.. math:: :label: eq_11 + + g'_k = g_k + \ln\left(\frac{N_{k-1}}{N_k}\right) + +where :math:`g'_k` is the corrected alchemical weight of state :math:`k`, and :math:`N_{k-1}` and :math:`N_k` are the histogram counts of states :math:`k-1` and :math:`k`, respectively. + +.. _doc_hist_correction: + +5.3. Histogram correction +------------------------- +In our experiment, we have also tried applying histogram corrections upon weight combination across replicas to +correct the effect that the targeting distribution is a function of time. The idea is to leverage the more +reliable statistics we get from the overlapping states. In a limiting case where we have two weight-updating +EE simulations sampling the same set of states, the way we take full advantage of all the samples collected +in two simulations is to consider the histogram of both simulations and base the flatness criteria on the +sum of the histograms from both simulations, in which case the weights should then equilibrate faster +than a single weight-updating EE simulation. Click the example below to see a more detailed demonstration/derivation +of the histogram correction approach implemented in :code:`ensemble_md`. + +.. collapse:: An example of histogram correction + + Here, we consider replicas A and B sampling states 0 to 4 and states 1 to 5, respectively. At time :math:`t`, + these two replicas have the following weights for their state sets. + + :: + + Alchemical weights + + State 0 1 2 3 4 5 + Rep A 0 4.00 10.69 12.18 12.52 X + Rep B X 0 5.15 7.14 8.48 8.16 + + And the histogram counts at time :math:`t` are as follows + + :: + + Histogram counts + State 0 1 2 3 4 5 + Rep A 416 332 130 71 61 X + Rep B X 303 181 123 143 260 + + + During weight combination, for states 1 and 2, we calculate the following average weight difference: + + .. math:: :label: eq_12 + + \Delta f'_{21}=\frac{1}{2}\left(\Delta f^{A}_{21} + \Delta f^{B}_{21}\right)=\frac{1}{2}\left(\ln\left(\frac{p^A_1}{p^A_2}\right) +\ln\left(\frac{p^B_1}{p^B_2}\right)\right)=\ln\left[\left(\frac{p^A_1 p^B_1}{p^A_2 p^B_2}\right)^{1/2}\right] + + Let :math:`\Delta f'_{21}=\ln\left(\frac{N_1'}{N_2'}\right)`. We then have + + .. math:: :label: eq_13 + + \frac{N_1'}{N_2'}=\left(\frac{p^A_1 p^B_1}{p^A_2 p^B_2}\right)^{1/2} + + In synchronous REXEE simulations, each replica should have the same total amount of counts, so the normalization constant for replicas A + and B are the same, i.e., :math:`p^A_1 = N^A_1/N`, :math:`p^B_1 = N^B_1/N`, ... etc. Therefore, we have + + .. math:: :label: eq_14 + + \frac{N_1'}{N_2'}=\left(\frac{N^A_1 N^B_1}{N^A_2 N^B_2}\right)^{1/2} + + That is, the ratio of corrected histogram counts should be the geometric mean of the ratios of the original histogram counts. + Using the numbers above, we calculate the ratios of histogram counts for all neighboring states. That is, for states accessible by multiple replicas, we + calculate the geometric mean of the values of :math:`N_{i+1}/N_i` from different replicas. Otherwise, we simply calculate :math:`N_{i+1}/N_i`. + + :: + + States (0, 1) (1, 2) (2, 3) (3, 4) (4, 5) + Final 0.798 0.484 0.609 0.999 1.818 + + + Note that given :math:`N_1`, we can get :math:`N_2` by calculating :math:`N_1 \times \frac{N_2}{N_1}` and get :math:`N_2` by calculating + :math:`N_1 \times \frac{N_2}{N_1} \times \frac{N_3}{N_2}` and so on. So the histogram counts of the whole set of states would be as follows: + + :: + + Final N 416 332 161 98 98 178 + + + The above distribution can be used to derive the distribution for each state set easily: + + :: + + States 0 1 2 3 4 5 + Rep A 416 332 161 98 98 + Rep B X 332 161 98 98 178 + +| -3.1. Basic idea ---------------- -To leverage the stastics of the states collected from multiple replicas, we recommend -combining the alchemical weights of these states across replicas during an weight-updating REXEE simulation. -Ideally, the modified weights should facilitate the convergence of the alchemical weights in expanded ensemble, -which in the limit of inifinite simulation time correspond to dimensionless free energies of the alchemical states. -The modified weights also directly influence the the accpetance ratio, hence the convergence of the simulation -ensemble. There are various possible ways to combine weights across replicas, though some of them might -have the issue of reference-dependent results, or the issue of coupling overlapped and non-overlapped states. -Here, we present the most straightforward method that circumvent these two issues. This method can be enabled by -setting the parameter :code:`w_combine` to `True` in the input YAML file. - -3.2. The details of the method ------------------------------- -Generally, weight combination is performed after the final configurations have beeen figured out and it is just for -the initialization of the MDP files for the next iteration. Now, to demonstrate the method implemented in -:code:`ensemble_md` (or more specifically, :obj:`.combine_weights`), here we consider the following sets of weights -as an example, with :code:`X` denoting a state not present in the alchemical range: - -:: - - State 0 1 2 3 4 5 - Rep A 0.0 2.1 4.0 3.7 X X - Rep B X 0.0 1.7 1.2 2.6 X - Rep C X X 0.0 -0.4 0.9 1.9 - -As shown above, the three replicas sample different but overlapping states. Now, our goal -is to - -* For state 1, combine the weights arcoss replicas 1 and 2. -* For states 2 and 3, combine the weights across all three replicas. -* For state 4, combine the weights across replicas 1 and 2. - -That is, we combine weights arcoss all replicas that sample the state of interest regardless -which replicas are swapping. The outcome of the whole process should be three vectors of modified -alchemical weights, one for each replica, that should be specified in the MDP files for the next iteration. -Below we elaborate the details of each step carried out by our method. - -Step 1: Calculate the weight difference between adjacent states -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -First, we calculate the weight differences, which can be regarded rough estimates -of free energy differences, between the adjacent states. We therefore have: - -:: - - States (0, 1) (1, 2) (2, 3) (3, 4) (4, 5) - Rep 1 2.1 1.9 -0.3 X X - Rep 2 X 1.7 -0.5 1.4 X - Rep 3 X X -0.4 1.3 1.0 - -Note that to calculate the difference between, say, states 1 and 2, from a certain replica, -both these states must be present in the alchemical range of the replica. Otherwise, a free -energy difference can't not be calculated and is denoted with :code:`X`. - -Step 2: Take the average of the weight differences across replicas -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Then, for the weight differences that are available in more than 1 replica, we take the simple -average of the weight differences. That is, we have: - -:: - - States (0, 1) (1, 2) (2, 3) (3, 4) (4, 5) - Final 2.1 1.8 -0.4 1.35 1.0 - -Assigning the fist state as the reference, we have the following profile: - -:: - - Final g 0.0 2.1 3.9 3.5 4.85 5.85 - -Notably, a weighted average is typically preferred as it is less sensitive to poor estimates. (See -the section of free energy calculations, where we use basically the same method as the one used here -except that weighte average are calculated.) However, a weighted average requires the uncertainties -of the involved weight differences and calculating the uncertainties of the weight difference (which -basically are estimates of free energy differences) is to computationally expensive, so we only calculate -simple averages when combining the weights. - -Step 3: Determine the vector of alchemical weights for each replica -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Finally, we need to determine the vector of alchemical weights for each replica. To do this, -we just shift the weight of the first state of each replica back to 0. As a result, we have -the following vectors: - -:: - - State 0 1 2 3 4 5 - Rep 1 0.0 2.1 3.9 3.5 X X - Rep 2 X 0.0 1.8 1.4 2.75 X - Rep 3 X X 0.0 -0.4 0.95 1.95 - -Again, as a reference, here are the original weights: - -:: - - State 0 1 2 3 4 5 - Rep 1 0.0 2.1 4.0 3.7 X X - Rep 2 X 0.0 1.7 1.2 2.6 X - Rep 3 X X 0.0 -0.4 0.9 1.9 - -Notably, taking the simple average of weight differences/free energy differences is equivalent to -taking the geometric average of the probability ratios. - -.. _doc_histogram: - -3.3. Histogram corrections --------------------------- -In the weight-combining method shown above, we frequently exploted the relationship :math:`g(\lambda)=f(\lambda)=-\ln p(\lambda)`. -However, this relationship is true only when the histogram of state vistation is exactly flat, which rarely happens in reality. -To correct this deviation, we can convert the difference in the histogram counts into the difference in free energies. This is based -on the fact that the ratio of histogram counts is equal to the ratio of probabilities, whose natural -logarithm is equal to the free energy difference of the states of interest. Specifically, we have: +5.3. Our experience with the correction schemes +----------------------------------------------- +As per our experiments with the correction schemes (also partly discussed in our paper [Hsu2024]_), +here are some interesting observations about the correction schemes: -.. math:: - g_k'=g_k + \ln\left(\frac{N_{k-1}}{N_k}\right) +- Generally, the application of weight combination schemes would lengthen the weight convergence time for a + weight-updating REXEE simulation, without necessarily converging to more accurate weights, as compared to running + a weight-updating EE simulation for each state set. +- Interestingly, in terms of the weight convergence time and the weight accuracy, here is the ranking of the performance + of different combinations of the correction schemes, with the best performance listed first: -where :math:`g_k'` is the corrected alchemical weight and :math:`N_{k-1}` and :math:`N_k` are the histogram -counts of states :math:`k-1` and :math:`k`. + - No correction schemes at all, i.e., weight-updating EE simulation for each state set. + - Weight combination with simple averages + histogram correction + - Weight combination with simple averages + - Weight combination with inverse-variance weighted averages +- We reason these observations that combining weights does not improve convergence is because the exchanges of coordinates between replicas have already caused each + replica to visit all of the configurations that started with different replicas, and thus have + “seen” the different configurations and incorporated them into the accumulated weights. + Therefore, additionally combining weights across replicas may not provide any additional + advantage. In addition, small changes in weights can drastically affect sampling, + as state probabilities are exponential in the free energy differences between states. + If one of the weights being combined is particularly bad, it will disrupt sampling for the other + weights, and will therefore lower the convergence rate. For more details about the experiments, please refer to our paper [Hsu2024]_. -Notably, this correction method can possibly overcorrect the weights when the histogram counts :math:`N_k` or :math:`N_{k-1}` are too low. -To deal with this, the user can choose to specify :code:`N_cutoff` in the input YAML file, so that the the histogram -correction will performed only when :math:`\text{argmin}(N_k, N_{k-1})` is larger than the cutoff. Also, this histogram correction -should always be carried out before weight combination. This method is implemented in :obj:`.histogram_correction`. +6. Free energy calculations +=========================== +The free energy calculation protocol of the REXEE method is basically the same as those for the EE and HREX methods. +Specifically, for each state set in a fixed-weight REXEE simulation, we concatenate the trajectories from all replicas, truncate the non-equilibrium +region, and then decorrelate the concatenated data. Then, for each replica in the fixed-weight REXEE simulation, one can use +free energy estimators such as TI, BAR, and MBAR to calculate the alchemical free energies for different state sets. +For the overlapping states, one can then use Equations :eq:`eq_9` and :eq:`eq_10` to calculate the mean of the associated free energy differences +:math:`\overline{\Delta G_{(s, s+1)}}` and the accompanying propagated error :math:`\delta_{(s, s+1)}`, with :math:`\Delta g^k_{(s, s+1)}` +replaced by :math:`\Delta G^k_{(s, s+1)}`, the free energy difference computed by the chosen free energy estimator. In this context, +:math:`\sigma^k_{(s, s+1)}` used in Equations :eq:`eq_9` and :eq:`eq_10` should be the uncertainty associated with :math:`\Delta G^k_{(s, s+1)}` +calculated by the estimator. In :code:`ensemble_md`, this has been implemented in the function :func:`.calculate_free_energy` in :obj:`.analyze_free_energy`. -4. Parameter space of REXEE -=========================== \ No newline at end of file diff --git a/ensemble_md/analysis/analyze_matrix.py b/ensemble_md/analysis/analyze_matrix.py index 2b226f5f..bd70a063 100644 --- a/ensemble_md/analysis/analyze_matrix.py +++ b/ensemble_md/analysis/analyze_matrix.py @@ -28,7 +28,7 @@ def calc_transmtx(log_file, simulation_type='EE'): Parameters ---------- log_file : str - The file path to the log file to be parsed. + The file path of the log file to be parsed. simulation_type : str, Optional The type of simulation. It can be either a :code:`EE` (expanded ensemble) or :code:`HREX` (Hamiltonian replica exchange) simulation. The default is :code:`EE`. diff --git a/ensemble_md/analysis/analyze_traj.py b/ensemble_md/analysis/analyze_traj.py index 0d1619e8..7119dde3 100644 --- a/ensemble_md/analysis/analyze_traj.py +++ b/ensemble_md/analysis/analyze_traj.py @@ -29,7 +29,7 @@ def extract_state_traj(dhdl): Parameters ---------- dhdl : str - The file path to the GROMACS DHDL file to be parsed. + The file path of the GROMACS DHDL file to be parsed. Returns ------- @@ -230,7 +230,7 @@ def stitch_xtc_trajs(gmx_executable, files, rep_trajs): Parameters ---------- gmx_executable : str - The path to the GROMACS executable. + The path of the GROMACS executable. files : list A list of lists of file paths to GROMACS XTC files. Specifically, :code:`files[i]` should be a list containing the paths to the files of interest from all iterations in replica :code:`i`. The files should be sorted @@ -1260,7 +1260,7 @@ def get_delta_w_updates(log_file, plot=False): Parameters ---------- log_file : str - The file path to the LOG file. + The file path of the LOG file. plot : bool, Optional Whether to plot the Wang-Landau incrementor as a function of time. The default is :code:`False`. diff --git a/ensemble_md/analysis/clustering.py b/ensemble_md/analysis/clustering.py index 51f9f1db..ef58900e 100644 --- a/ensemble_md/analysis/clustering.py +++ b/ensemble_md/analysis/clustering.py @@ -22,7 +22,7 @@ def cluster_traj(gmx_executable, inputs, grps, coupled_only=True, method='linkag Parameters ---------- gmx_executable : str - The path to the GROMACS executable. + The path of the GROMACS executable. inputs : dict A dictionary that contains the different input files required for the clustering analysis. The dictionary must have the following four keys: :code:`traj` (input trajectory file in diff --git a/ensemble_md/analysis/msm_analysis.py b/ensemble_md/analysis/msm_analysis.py index 309927a0..98333fa6 100644 --- a/ensemble_md/analysis/msm_analysis.py +++ b/ensemble_md/analysis/msm_analysis.py @@ -20,6 +20,7 @@ warnings.warn('This module is still a work in progress. Please note that there are no immediate plans to expand or rigorously test this module.', UserWarning) # noqa: E501 + def plot_acf(models, n_tot, fig_name): """ Plots the state index autocorrelation times for all configurations in a single plot. diff --git a/ensemble_md/cli/analyze_REXEE.py b/ensemble_md/cli/analyze_REXEE.py index 7dcdead6..9e1d3a4f 100644 --- a/ensemble_md/cli/analyze_REXEE.py +++ b/ensemble_md/cli/analyze_REXEE.py @@ -35,47 +35,48 @@ def initialize(args): parser = argparse.ArgumentParser( - description='This code analyzes a REXEE simulation. Note that the template MDP\ - file specified in the YAML file needs to be available in the working directory.') + description='This CLI analyzes a REXEE simulation.') parser.add_argument('-y', '--yaml', type=str, default='params.yaml', - help='The input YAML file used to run the REXEE simulation. (Default: params.yaml)') + help='The file path of the input YAML file used to run the REXEE simulation. \ + (Default: params.yaml)') parser.add_argument('-o', '--output', type=str, default='analyze_REXEE_log.txt', - help='The output log file that contains the analysis results of REXEE. \ + help='The file path of the output log file that contains the analysis results of REXEE. \ (Default: analyze_REXEE_log.txt)') parser.add_argument('-rt', '--rep_trajs', type=str, default='rep_trajs.npy', - help='The NPY file containing the replica-space trajectory. (Default: rep_trajs.npy)') + help='The file path of the NPY file containing the replica-space trajectory. \ + (Default: rep_trajs.npy)') parser.add_argument('-st', '--state_trajs', type=str, default='state_trajs.npy', - help='The NPY file containing the stitched state-space trajectory. \ + help='The file path of the NPY file containing the stitched state-space trajectory. \ If the specified file is not found, the code will try to find all the trajectories and \ stitch them. (Default: state_trajs.npy)') parser.add_argument('-sts', '--state_trajs_for_sim', type=str, default='state_trajs_for_sim.npy', - help='The NPY file containing the stitched state-space time series for different alchemical\ - ranges. If the specified file is not found, the code will try to find all the \ - time series and stitch them. (Default: state_trajs.npy)') + help='The file path of the NPY file containing the stitched state-space time series for \ + different state sets. If the specified file is not found, the code will try to find \ + all the time series and stitch them. (Default: state_trajs.npy)') parser.add_argument('-d', '--dir', default='analysis', - help='The name of the folder for storing the analysis results.') + help='The path of the folder for storing the analysis results. (Default: analysis)') parser.add_argument('-m', '--maxwarn', type=int, default=0, - help='The maximum number of warnings in parameter specification to be ignored.') + help='The maximum number of warnings in parameter specification to be ignored. (Default: 0)') args_parse = parser.parse_args(args) return args_parse diff --git a/ensemble_md/cli/explore_REXEE.py b/ensemble_md/cli/explore_REXEE.py index 886e798c..77a74b2b 100644 --- a/ensemble_md/cli/explore_REXEE.py +++ b/ensemble_md/cli/explore_REXEE.py @@ -17,9 +17,9 @@ def initialize(args): parser = argparse.ArgumentParser( - description='This code explores the parameter space of homogenous REXEE to help you figure \ + description='This CLI explores the parameter space of a homogenous REXEE simulation to help you figure \ out all possible combinations of the number of replicas, the number of \ - states in each replica, and the number of overlapping states, and the total number states.') + states in each replica, and the number of overlapping states, given the total number of states.') parser.add_argument('-N', '--N', required=True, @@ -32,7 +32,7 @@ def initialize(args): parser.add_argument('-n', '--n', type=int, - help='The number of states for each replica.') + help='The number of states per replica.') parser.add_argument('-s', '--s', type=int, @@ -41,14 +41,14 @@ def initialize(args): '--cnst', default=False, action='store_true', - help='Whether the apply the constraint such that the number of overlapping states \ - does not exceed 50%% of the number of states in both overlapping replicas.') + help='Whether to apply the constraint such that the number of overlapping states \ + does not exceed 50%% of the number of states in adjacent replicas. (Default: False)') parser.add_argument('-e', '--estimate', default=False, action='store_true', help='Whether to provide estimates of the chance of not having any swappable \ - pairs for each solution.') + pairs for each solution. (Default: False)') args_parse = parser.parse_args(args) return args_parse diff --git a/ensemble_md/cli/run_REXEE.py b/ensemble_md/cli/run_REXEE.py index 6d63bc2c..2337a22c 100644 --- a/ensemble_md/cli/run_REXEE.py +++ b/ensemble_md/cli/run_REXEE.py @@ -24,36 +24,37 @@ def initialize(args): parser = argparse.ArgumentParser( - description='This code runs a REXEE simulation given necessary inputs.') + description='This CLI runs a REXEE simulation given necessary inputs.') parser.add_argument('-y', '--yaml', type=str, default='params.yaml', - help='The input YAML file that contains REXEE parameters. (Default: params.yaml)') + help='The file path of the input YAML file that contains REXEE parameters. \ + (Default: params.yaml)') parser.add_argument('-c', '--ckpt', type=str, default='rep_trajs.npy', - help='The NPY file containing the replica-space trajectories. This file is a \ - necessary checkpoint file for extending the simulaiton. (Default: rep_trajs.npy)') + help='The file path of the NPY file containing the replica-space trajectories. This file is a \ + necessary checkpoint file for extending the simulation. (Default: rep_trajs.npy)') parser.add_argument('-g', '--g_vecs', type=str, default='g_vecs.npy', - help='The NPY file containing the timeseries of the whole-range alchemical weights. \ - This file is a necessary input if ones wants to update the file when extending \ - the simulation. (Default: g_vecs.npy)') + help='The file path of the NPY file containing the time series of the whole-range\ + alchemical weights. This file is a necessary input if one wants to update the \ + file when extending a weight-updating simulation. (Default: g_vecs.npy)') parser.add_argument('-o', '--output', type=str, default='run_REXEE_log.txt', - help='The output file for logging how replicas interact with each other. \ + help='The file path of the output file for logging how replicas interact with each other. \ (Default: run_REXEE_log.txt)') parser.add_argument('-m', '--maxwarn', type=int, default=0, - help='The maximum number of warnings in parameter specification to be ignored.') + help='The maximum number of warnings in parameter specification to be ignored. (Default: 0)') args_parse = parser.parse_args(args) return args_parse diff --git a/ensemble_md/replica_exchange_EE.py b/ensemble_md/replica_exchange_EE.py index 9d83444c..24baf4e6 100644 --- a/ensemble_md/replica_exchange_EE.py +++ b/ensemble_md/replica_exchange_EE.py @@ -621,7 +621,7 @@ def get_ref_dist(self, pullx_file=None): Parameters ---------- pullx_file : str, Optional - The path to the pullx file whose initial value will be used as the reference distance. + The path of the pullx file whose initial value will be used as the reference distance. Usually, this should be the path of the pullx file of the first iteration. The default is :code:`sim_0/iteration_0/pullx.xvg`. """ diff --git a/ensemble_md/utils/gmx_parser.py b/ensemble_md/utils/gmx_parser.py index 9cddacc6..95e671bc 100644 --- a/ensemble_md/utils/gmx_parser.py +++ b/ensemble_md/utils/gmx_parser.py @@ -181,7 +181,7 @@ class MDP(odict): Parameters ---------- input_mdp : str, Optional - The path to the input MDP file. The default is None. + The path of the input MDP file. The default is None. **kwargs : Optional Additional keyword arguments to be passed to add additional key-value pairs to the MDP instance. Note that no sanity checks will be performed for the key-value pairs passed in this way. This @@ -195,7 +195,7 @@ class MDP(odict): PARAMETER : :code:`re.Pattern` object A compiled regular expression pattern for parameters in MDP files. input_mdp : str - The real path to the input MDP file returned by :code:`os.path.realpath(input_mdp)`, + The real path of the input MDP file returned by :code:`os.path.realpath(input_mdp)`, which resolves any symbolic links in the path. Example diff --git a/ensemble_md/utils/utils.py b/ensemble_md/utils/utils.py index e74f5579..f56f3535 100644 --- a/ensemble_md/utils/utils.py +++ b/ensemble_md/utils/utils.py @@ -28,7 +28,7 @@ class Logger: Parameters ---------- logfile : str - The file path to which the standard output and standard error should be logged. + The file path of which the standard output and standard error should be logged. Attributes ----------