From 1ff15fe99b6faee41af097cdea074f1dc7acd45e Mon Sep 17 00:00:00 2001
From: Zhiyi Wu
Date: Mon, 12 Jun 2023 22:04:20 +0100
Subject: [PATCH] The statistical inefficiency is logged at the debug level
 (#325)

- close #295
- log SI (at DEBUG)
- updated docs
- updated CHANGES

---------

Co-authored-by: Zhiyi Wu
---
 CHANGES                                    |  2 ++
 docs/parsing.rst                           |  3 +++
 src/alchemlyb/preprocessing/subsampling.py |  8 ++++++++
 src/alchemlyb/tests/test_preprocessing.py  | 21 +++++++++++++++++++++
 4 files changed, 34 insertions(+)

diff --git a/CHANGES b/CHANGES
index d5f6b9fc..c3ed0b3e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -22,6 +22,8 @@ Changes
   - Use loguru instead of logging for log (issue #301, PR #303).
 
 Enhancements
+  - "Statistical inefficiency" is logged at debug level for equilibrium_detection
+    and statistical_inefficiency (issue #295, PR#325).
   - Add a parser to read serialised pandas dataframe (parquet) (issue #316, PR#317).
   - workflow.ABFE allow parquet as input (issue #316, PR#317).
   - Allow MBAR estimator to use bootstrap to compute error (issue #320, PR#322).
diff --git a/docs/parsing.rst b/docs/parsing.rst
index d107278e..73de3b68 100644
--- a/docs/parsing.rst
+++ b/docs/parsing.rst
@@ -72,6 +72,9 @@ a :class:`pandas.DataFrame`. For loading alchemlyb data we provide the
 
     new_u_nk = extract_u_nk('u_nk.parquet', T=300)
     new_dHdl = extract_dHdl('dHdl.parquet', T=300)
 
+.. Note::
+   Serialization of :class:`pandas.DataFrame` to `parquet` file is only allowed
+   for `pandas>=2`, whereas the deserialization is permitted for any pandas version.
 
 .. _dHdl:
diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index ffe39704..4633a87e 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -7,6 +7,7 @@
 from pymbar.timeseries import detect_equilibration as _detect_equilibration
 from pymbar.timeseries import statistical_inefficiency as _statistical_inefficiency
 from pymbar.timeseries import subsample_correlated_data as _subsample_correlated_data
+from loguru import logger
 
 from .. import pass_attrs
 
@@ -516,12 +517,15 @@ def statistical_inefficiency(
         df = slicing(df, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of series (could use fft=True but needs test)
+        logger.debug("Running statistical inefficiency analysis.")
         statinef = _statistical_inefficiency(series)
+        logger.debug("Statistical inefficiency: {:.2f}.", statinef)
 
         # use the subsample_correlated_data function to get the subsample index
         indices = _subsample_correlated_data(
             series, g=statinef, conservative=conservative
         )
+        logger.debug("Number of uncorrelated samples: {}.", len(indices))
         df = df.iloc[indices]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
@@ -592,12 +596,16 @@ def equilibrium_detection(
         df = slicing(df, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of series, with equilibrium detection
+        logger.debug("Running equilibration detection.")
         t, statinef, Neff_max = _detect_equilibration(series.values)
+        logger.debug("Start index: {}.", t)
+        logger.debug("Statistical inefficiency: {:.2f}.", statinef)
 
         series_equil = series[t:]
         df_equil = df[t:]
 
         indices = _subsample_correlated_data(series_equil, g=statinef)
+        logger.debug("Number of uncorrelated samples: {}.", len(indices))
         df = df_equil.iloc[indices]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py
index 766bc3df..00e3c030 100644
--- a/src/alchemlyb/tests/test_preprocessing.py
+++ b/src/alchemlyb/tests/test_preprocessing.py
@@ -1,6 +1,8 @@
 """Tests for preprocessing functions.
 
 """
+import logging
+
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
@@ -523,3 +525,22 @@ def test_u_nk2series_deprecated(self, u_nk, methodargs, reference):
     def test_other_method_ValueError(self, u_nk):
         with pytest.raises(ValueError, match="Decorrelation method bogus not found."):
             u_nk2series(u_nk, method="bogus")
+
+
+class TestLogging:
+    def test_detect_equilibration(self, caplog, u_nk):
+        with caplog.at_level(logging.DEBUG):
+            decorrelate_u_nk(u_nk, remove_burnin=True)
+
+        assert "Running equilibration detection." in caplog.text
+        assert "Start index:" in caplog.text
+        assert "Statistical inefficiency:" in caplog.text
+        assert "Number of uncorrelated samples:" in caplog.text
+
+    def test_statistical_inefficiency(self, caplog, u_nk):
+        with caplog.at_level(logging.DEBUG):
+            decorrelate_u_nk(u_nk)
+
+        assert "Running statistical inefficiency analysis." in caplog.text
+        assert "Statistical inefficiency:" in caplog.text
+        assert "Number of uncorrelated samples:" in caplog.text
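
Usage note (not part of the patch): the sketch below shows one way to surface the
new DEBUG messages from user code once this change is installed. The loguru sink
configuration is a minimal example rather than a prescribed setup, and the input
file name "lambda_00.xvg" is hypothetical; extract_u_nk, decorrelate_u_nk, and the
quoted messages come from alchemlyb and the diff above.

    # Minimal sketch, assuming an alchemlyb installation that includes this patch.
    import sys

    from loguru import logger

    from alchemlyb.parsing.gmx import extract_u_nk
    from alchemlyb.preprocessing import decorrelate_u_nk

    # Ensure a DEBUG-level sink exists so the new messages are actually shown.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG")

    # "lambda_00.xvg" is a hypothetical GROMACS output file, used only for illustration.
    u_nk = extract_u_nk("lambda_00.xvg", T=300)

    # With remove_burnin=True this goes through equilibrium_detection and logs
    # "Running equilibration detection.", "Start index: ...",
    # "Statistical inefficiency: ..." and "Number of uncorrelated samples: ...".
    decorrelated = decorrelate_u_nk(u_nk, remove_burnin=True)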