From 1ff15fe99b6faee41af097cdea074f1dc7acd45e Mon Sep 17 00:00:00 2001
From: Zhiyi Wu
Date: Mon, 12 Jun 2023 22:04:20 +0100
Subject: [PATCH] The statistical inefficiency is logged at the debug level
 (#325)

- close #295
- log SI (at DEBUG)
- updated docs
- updated CHANGES

---------

Co-authored-by: Zhiyi Wu
---
 CHANGES                                    |  2 ++
 docs/parsing.rst                           |  3 +++
 src/alchemlyb/preprocessing/subsampling.py |  8 ++++++++
 src/alchemlyb/tests/test_preprocessing.py  | 21 +++++++++++++++++++++
 4 files changed, 34 insertions(+)

diff --git a/CHANGES b/CHANGES
index d5f6b9fc..c3ed0b3e 100644
--- a/CHANGES
+++ b/CHANGES
@@ -22,6 +22,8 @@ Changes
   - Use loguru instead of logging for log (issue #301, PR #303).
 
 Enhancements
+  - "Statistical inefficiency" is logged at debug level for equilibrium_detection
+    and statistical_inefficiency (issue #295, PR#325).
   - Add a parser to read serialised pandas dataframe (parquet) (issue #316, PR#317).
   - workflow.ABFE allow parquet as input (issue #316, PR#317).
   - Allow MBAR estimator to use bootstrap to compute error (issue #320, PR#322).
diff --git a/docs/parsing.rst b/docs/parsing.rst
index d107278e..73de3b68 100644
--- a/docs/parsing.rst
+++ b/docs/parsing.rst
@@ -72,6 +72,9 @@ a :class:`pandas.DataFrame`. For loading alchemlyb data we provide the
 
     new_u_nk = extract_u_nk('u_nk.parquet', T=300)
     new_dHdl = extract_dHdl('dHdl.parquet', T=300)
 
+.. Note::
+   Serialization of :class:`pandas.DataFrame` to `parquet` file is only allowed
+   for `pandas>=2`, whereas the deserialization is permitted for any pandas version.
 
 .. _dHdl:
diff --git a/src/alchemlyb/preprocessing/subsampling.py b/src/alchemlyb/preprocessing/subsampling.py
index ffe39704..4633a87e 100644
--- a/src/alchemlyb/preprocessing/subsampling.py
+++ b/src/alchemlyb/preprocessing/subsampling.py
@@ -7,6 +7,7 @@
 from pymbar.timeseries import detect_equilibration as _detect_equilibration
 from pymbar.timeseries import statistical_inefficiency as _statistical_inefficiency
 from pymbar.timeseries import subsample_correlated_data as _subsample_correlated_data
+from loguru import logger
 
 from .. import pass_attrs
 
@@ -516,12 +517,15 @@ def statistical_inefficiency(
         df = slicing(df, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of series (could use fft=True but needs test)
+        logger.debug("Running statistical inefficiency analysis.")
         statinef = _statistical_inefficiency(series)
+        logger.debug("Statistical inefficiency: {:.2f}.", statinef)
 
         # use the subsample_correlated_data function to get the subsample index
         indices = _subsample_correlated_data(
             series, g=statinef, conservative=conservative
         )
+        logger.debug("Number of uncorrelated samples: {}.", len(indices))
         df = df.iloc[indices]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
@@ -592,12 +596,16 @@ def equilibrium_detection(
         df = slicing(df, lower=lower, upper=upper, step=step)
 
         # calculate statistical inefficiency of series, with equilibrium detection
+        logger.debug("Running equilibration detection.")
         t, statinef, Neff_max = _detect_equilibration(series.values)
+        logger.debug("Start index: {}.", t)
+        logger.debug("Statistical inefficiency: {:.2f}.", statinef)
 
         series_equil = series[t:]
         df_equil = df[t:]
 
         indices = _subsample_correlated_data(series_equil, g=statinef)
+        logger.debug("Number of uncorrelated samples: {}.", len(indices))
         df = df_equil.iloc[indices]
     else:
         df = slicing(df, lower=lower, upper=upper, step=step)
diff --git a/src/alchemlyb/tests/test_preprocessing.py b/src/alchemlyb/tests/test_preprocessing.py
index 766bc3df..00e3c030 100644
--- a/src/alchemlyb/tests/test_preprocessing.py
+++ b/src/alchemlyb/tests/test_preprocessing.py
@@ -1,6 +1,8 @@
 """Tests for preprocessing functions.
 
 """
+import logging
+
 import numpy as np
 import pytest
 from numpy.testing import assert_allclose
@@ -523,3 +525,22 @@ def test_u_nk2series_deprecated(self, u_nk, methodargs, reference):
     def test_other_method_ValueError(self, u_nk):
         with pytest.raises(ValueError, match="Decorrelation method bogus not found."):
             u_nk2series(u_nk, method="bogus")
+
+
+class TestLogging:
+    def test_detect_equilibration(self, caplog, u_nk):
+        with caplog.at_level(logging.DEBUG):
+            decorrelate_u_nk(u_nk, remove_burnin=True)
+
+        assert "Running equilibration detection." in caplog.text
+        assert "Start index:" in caplog.text
+        assert "Statistical inefficiency:" in caplog.text
+        assert "Number of uncorrelated samples:" in caplog.text
+
+    def test_statistical_inefficiency(self, caplog, u_nk):
+        with caplog.at_level(logging.DEBUG):
+            decorrelate_u_nk(u_nk)
+
+        assert "Running statistical inefficiency analysis." in caplog.text
+        assert "Statistical inefficiency:" in caplog.text
+        assert "Number of uncorrelated samples:" in caplog.text
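
Usage note (not part of the patch): the sketch below shows one way to surface the
new DEBUG messages from user code once this change is installed. The loguru sink
configuration is a minimal example rather than a prescribed setup, and the input
file name "lambda_00.xvg" is hypothetical; extract_u_nk, decorrelate_u_nk, and the
quoted messages come from alchemlyb and the diff above.

    # Minimal sketch, assuming an alchemlyb installation that includes this patch.
    import sys

    from loguru import logger

    from alchemlyb.parsing.gmx import extract_u_nk
    from alchemlyb.preprocessing import decorrelate_u_nk

    # Ensure a DEBUG-level sink exists so the new messages are actually shown.
    logger.remove()
    logger.add(sys.stderr, level="DEBUG")

    # "lambda_00.xvg" is a hypothetical GROMACS output file, used only for illustration.
    u_nk = extract_u_nk("lambda_00.xvg", T=300)

    # With remove_burnin=True this goes through equilibrium_detection and logs
    # "Running equilibration detection.", "Start index: ...",
    # "Statistical inefficiency: ..." and "Number of uncorrelated samples: ...".
    decorrelated = decorrelate_u_nk(u_nk, remove_burnin=True)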