From 5d5cd66e5fd58c3c13f2e0ea2e585ff3c4d4db93 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?H=C3=A5vard=20Berland?= <havb@equinor.com>
Date: Tue, 9 Feb 2021 10:29:55 +0100
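Subject: [PATCH] Prune cache_eclsum from fmu-ensemble

Remove the cache_eclsum argument from load_smry(), get_smry(),
get_smry_stats() and get_smry_dates() in ensemble.py, from load_smry(),
get_smry() and get_smry_dates() in ensembleset.py, and from get_smry()
in realization.py. Also remove the cache argument of get_eclsum() and
the _eclsum/_eclsum_include_restart attributes that backed it, so
get_eclsum() no longer stores or reuses an EclSum object between calls.
test_eclsumcaching is deleted since there is no cache left to test.

Callers that still pass cache_eclsum (or cache to get_eclsum()) need to
be updated to drop the argument.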

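---
Review notes (text between the "---" line and the diff is not part of
the commit message):

- get_smryvalues() in realization.py loses its early
  "return pd.DataFrame()" guard. get_eclsum() is documented to return
  None when nothing is found, so self.get_eclsum().get_dates(...) will
  raise AttributeError in that case instead of returning the empty
  dataframe that the get_smryvalues() docstring promises. Suggest
  binding eclsum = self.get_eclsum() once at the top of the method and
  returning pd.DataFrame() when it is None.
- With the cache gone, get_eclsum() is called repeatedly for the same
  realization: twice per realization in get_smry_dates() in
  ensemble.py, once per key in _glob_smry_keys(), and several times in
  get_smryvalues(), including once per property inside the dict
  comprehensions. Binding the result to a local variable in each of
  these places avoids the repeated calls.
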
 src/fmu/ensemble/ensemble.py    | 23 ++-------------
 src/fmu/ensemble/ensembleset.py | 19 ++-----------
 src/fmu/ensemble/realization.py | 49 +++++---------------------------
 tests/test_ensemble.py          | 50 ---------------------------------
 4 files changed, 11 insertions(+), 130 deletions(-)

diff --git a/src/fmu/ensemble/ensemble.py b/src/fmu/ensemble/ensemble.py
index b2604d43..f0010b65 100644
--- a/src/fmu/ensemble/ensemble.py
+++ b/src/fmu/ensemble/ensemble.py
@@ -697,7 +697,6 @@ def load_smry(
         time_index="raw",
         column_keys=None,
         stacked=True,
-        cache_eclsum=True,
         start_date=None,
         end_date=None,
         include_restart=True,
@@ -742,9 +741,6 @@ def load_smry(
                 by vector name, and with realization index as columns.
                 This only works when time_index is the same for all
                 realizations. Not implemented yet!
-            cache_eclsum (boolean): Boolean for whether we should cache the EclSum
-                objects. Set to False if you cannot keep all EclSum files in
-                memory simultaneously
             start_date (str or date): First date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -772,7 +768,6 @@ def load_smry(
             realization.load_smry(
                 time_index=time_index,
                 column_keys=column_keys,
-                cache_eclsum=cache_eclsum,
                 start_date=start_date,
                 end_date=end_date,
                 include_restart=include_restart,
@@ -963,7 +958,6 @@ def get_smry_dates(
         normalize=True,
         start_date=None,
         end_date=None,
-        cache_eclsum=True,
         include_restart=True,
     ):
         """Return list of datetimes for an ensemble according to frequency
@@ -999,13 +993,9 @@ def get_smry_dates(
         # Build list of list of eclsum dates
         eclsumsdates = []
         for _, realization in self.realizations.items():
-            if realization.get_eclsum(
-                cache=cache_eclsum, include_restart=include_restart
-            ):
+            if realization.get_eclsum(include_restart=include_restart):
                 eclsumsdates.append(
-                    realization.get_eclsum(
-                        cache=cache_eclsum, include_restart=include_restart
-                    ).dates
+                    realization.get_eclsum(include_restart=include_restart).dates
                 )
         return unionize_smry_dates(eclsumsdates, freq, normalize, start_date, end_date)
 
@@ -1014,7 +1004,6 @@ def get_smry_stats(
         column_keys=None,
         time_index="monthly",
         quantiles=None,
-        cache_eclsum=True,
         start_date=None,
         end_date=None,
     ):
@@ -1037,8 +1026,6 @@ def get_smry_stats(
                to compute. Quantiles refer to scientific standard, which
                is opposite to the oil industry convention.
                Ask for p10 if you need the oil industry p90.
-            cache_eclsum: boolean for whether to keep the loaded EclSum
-                object in memory after data has been loaded.
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -1070,7 +1057,6 @@ def get_smry_stats(
         dframe = self.get_smry(
             time_index=time_index,
             column_keys=column_keys,
-            cache_eclsum=cache_eclsum,
             start_date=start_date,
             end_date=end_date,
         )
@@ -1323,7 +1309,6 @@ def get_smry(
         self,
         time_index=None,
         column_keys=None,
-        cache_eclsum=True,
         start_date=None,
         end_date=None,
         include_restart=True,
@@ -1342,9 +1327,6 @@ def get_smry(
                a wanted frequencey for dates, daily, weekly, monthly, yearly,
                that will be send to get_smry_dates()
             column_keys: list of column key wildcards
-            cache_eclsum: boolean for whether to cache the EclSum
-                objects. Defaults to True. Set to False if
-                not enough memory to keep all summary files in memory.
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -1379,7 +1361,6 @@ def get_smry(
             dframe = realization.get_smry(
                 time_index=time_index,
                 column_keys=column_keys,
-                cache_eclsum=cache_eclsum,
                 include_restart=include_restart,
             )
             dframe.insert(0, "REAL", index)
diff --git a/src/fmu/ensemble/ensembleset.py b/src/fmu/ensemble/ensembleset.py
index 83a0af33..44ec2b1b 100644
--- a/src/fmu/ensemble/ensembleset.py
+++ b/src/fmu/ensemble/ensembleset.py
@@ -571,7 +571,6 @@ def load_smry(
         self,
         time_index="raw",
         column_keys=None,
-        cache_eclsum=True,
         start_date=None,
         end_date=None,
     ):
@@ -595,9 +594,6 @@ def load_smry(
                If a string is supplied, that string is attempted used
                via get_smry_dates() in order to obtain a time index.
             column_keys: list of column key wildcards
-            cache_eclsum: Boolean for whether we should cache the EclSum
-                objects. Set to False if you cannot keep all EclSum files in
-                memory simultaneously
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -616,7 +612,6 @@ def load_smry(
             ensemble.load_smry(
                 time_index=time_index,
                 column_keys=column_keys,
-                cache_eclsum=cache_eclsum,
                 start_date=start_date,
                 end_date=end_date,
             )
@@ -630,7 +625,6 @@ def get_smry(
         self,
         time_index=None,
         column_keys=None,
-        cache_eclsum=False,
         start_date=None,
         end_date=None,
     ):
@@ -646,11 +640,6 @@ def get_smry(
                If a string is supplied, that string is attempted used
                via get_smry_dates() in order to obtain a time index.
             column_keys: list of column key wildcards
-            cache_eclsum: boolean for whether to cache the EclSum
-                objects. Defaults to False. Set to True if
-                there is enough memory to keep all realizations summary
-                files in memory at once. This will speed up subsequent
-                operations
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -667,7 +656,7 @@ def get_smry(
         smrylist = []
         for _, ensemble in self._ensembles.items():
             smry = ensemble.get_smry(
-                time_index, column_keys, cache_eclsum, start_date, end_date
+                time_index, column_keys, start_date, end_date
             )
             smry.insert(0, "ENSEMBLE", ensemble.name)
             smrylist.append(smry)
@@ -676,7 +665,7 @@ def get_smry(
         return pd.DataFrame()
 
     def get_smry_dates(
-        self, freq="monthly", cache_eclsum=True, start_date=None, end_date=None
+        self, freq="monthly", start_date=None, end_date=None
     ):
         """Return list of datetimes from an ensembleset
 
@@ -689,9 +678,6 @@ def get_smry_dates(
                yield the sorted union of all valid timesteps for
                all realizations. Other valid options are
                'daily', 'monthly' and 'yearly'.
-            cache_eclsum: Boolean for whether we should cache the EclSum
-                objects. Set to False if you cannot keep all EclSum files in
-                memory simultaneously
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -709,7 +695,6 @@ def get_smry_dates(
             rawdates = rawdates.union(
                 ensemble.get_smry_dates(
                     freq="report",
-                    cache_eclsum=cache_eclsum,
                     start_date=start_date,
                     end_date=end_date,
                 )
diff --git a/src/fmu/ensemble/realization.py b/src/fmu/ensemble/realization.py
index d748df06..c3bf761c 100644
--- a/src/fmu/ensemble/realization.py
+++ b/src/fmu/ensemble/realization.py
@@ -100,9 +100,6 @@ def __init__(
         )
         self.eclfiles = None  # ecl2df.EclFiles object
 
-        self._eclsum = None  # Placeholder for caching
-        self._eclsum_include_restart = None  # Flag for cached object
-
         # The datastore for internalized data. Dictionary
         # indexed by filenames (local to the realization).
         # values in the dictionary can be either dicts or dataframes
@@ -894,7 +891,7 @@ def get_eclfiles(self):
                 return None
         return ecl2df.EclFiles(data_filename)
 
-    def get_eclsum(self, cache=True, include_restart=True):
+    def get_eclsum(self, include_restart=True):
         """
         Fetch the Eclipse Summary file from the realization
         and return as a libecl EclSum object
@@ -908,9 +905,6 @@ def get_eclsum(self, cache=True, include_restart=True):
         turning off autodiscovery is strongly recommended.
 
         Arguments:
-            cache: boolean indicating whether we should keep an
-                object reference to the EclSum object. Set to
-                false if you need to conserve memory.
             include_restart: boolean sent to libecl for whether restart
                 files should be traversed.
 
@@ -918,10 +912,6 @@ def get_eclsum(self, cache=True, include_restart=True):
             EclSum: object representing the summary file. None if
                 nothing was found.
         """
-        if cache and self._eclsum:  # Return cached object if available
-            if self._eclsum_include_restart == include_restart:
-                return self._eclsum
-
         unsmry_file_row = self.files[self.files.FILETYPE == "UNSMRY"]
         unsmry_filename = None
         if len(unsmry_file_row) == 1:
@@ -952,11 +942,6 @@ def get_eclsum(self, cache=True, include_restart=True):
             # or if SMSPEC is missing.
             logger.warning("Failed to create summary instance from %s", unsmry_filename)
             return None
-
-        if cache:
-            self._eclsum = eclsum
-            self._eclsum_include_restart = include_restart
-
         return eclsum
 
     def load_smry(self, **kwargs):
@@ -994,7 +979,6 @@ def get_smry(
         self,
         time_index=None,
         column_keys=None,
-        cache_eclsum=True,
         start_date=None,
         end_date=None,
         include_restart=True,
@@ -1013,8 +997,6 @@ def get_smry(
                to these. If a date in ISO-8601 format is supplied, that is
                used as a single date.
             column_keys: list of column key wildcards. None means everything.
-            cache_eclsum: boolean for whether to keep the loaded EclSum
-                object in memory after data has been loaded.
             start_date: str or date with first date to include.
                 Dates prior to this date will be dropped, supplied
                 start_date will always be included. Overridden if time_index
@@ -1032,7 +1014,7 @@ def get_smry(
         if self.get_eclfiles() is None:
             return pd.DataFrame()
         try:
-            dframe = ecl2df.summary.df(
+            return ecl2df.summary.df(
                 self.get_eclfiles(),
                 time_index=time_index,
                 column_keys=column_keys,
@@ -1043,17 +1025,6 @@ def get_smry(
                 paramfile=None,
                 datetime=datetimeindex,
             )
-            if cache_eclsum:
-                if self.get_eclfiles():
-                    # This is necessary for tests to pass, but might not
-                    # be the way to do it since ecl2df should take full
-                    # responsibility for the eclsum objects.
-                    self._eclsum = self.get_eclfiles().get_eclsum()
-            else:
-                # Do this to ensure that we cut the rope to the EclSum object
-                # Can be critical for garbage collection
-                self._eclsum = None
-            return dframe
         except OSError:
             # Missing or bogus UNSMRY file
             return pd.DataFrame()
@@ -1121,7 +1092,7 @@ def _glob_smry_keys(self, column_keys):
         keys = set()
         for key in column_keys:
             if isinstance(key, str):
-                keys = keys.union(set(self._eclsum.keys(key)))
+                keys = keys.union(set(self.get_eclsum().keys(key)))
         return list(keys)
 
     def get_volumetric_rates(self, column_keys=None, time_index=None, time_unit=None):
@@ -1142,24 +1113,18 @@ def get_smryvalues(self, props_wildcard=None):
             a dataframe with values. Raw times from UNSMRY.
             Empty dataframe if no summary file data available
         """
-        if not self._eclsum:  # check if it is cached
-            self.get_eclsum()
-
-        if not self._eclsum:
-            return pd.DataFrame()
-
         props = self._glob_smry_keys(props_wildcard)
 
-        if "numpy_vector" in dir(self._eclsum):
+        if "numpy_vector" in dir(self.get_eclsum()):
             data = {
-                prop: self._eclsum.numpy_vector(prop, report_only=False)
+                prop: self.get_eclsum().numpy_vector(prop, report_only=False)
                 for prop in props
             }
         else:  # get_values() is deprecated in newer libecl
             data = {
-                prop: self._eclsum.get_values(prop, report_only=False) for prop in props
+                prop: self.get_eclsum().get_values(prop, report_only=False) for prop in props
             }
-        dates = self._eclsum.get_dates(report_only=False)
+        dates = self.get_eclsum().get_dates(report_only=False)
         return pd.DataFrame(data=data, index=dates)
 
     def get_smry_dates(
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
index 75552dd2..1bb7d4db 100644
--- a/tests/test_ensemble.py
+++ b/tests/test_ensemble.py
@@ -828,56 +828,6 @@ def test_nonexisting():
     assert not nopermission
 
 
-def test_eclsumcaching():
-    """Test caching of eclsum"""
-
-    if "__file__" in globals():
-        # Easen up copying test code into interactive sessions
-        testdir = os.path.dirname(os.path.abspath(__file__))
-    else:
-        testdir = os.path.abspath(".")
-
-    dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
-    ens = ScratchEnsemble("reektest", dirs)
-
-    # The problem here is if you load in a lot of UNSMRY files
-    # and the Python process keeps them in memory. Not sure
-    # how to check in code that an object has been garbage collected
-    # but for garbage collection to work, at least the realization
-    # _eclsum variable must be None.
-
-    ens.load_smry()
-    # Default is to do caching, so these will not be None:
-    assert all([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    # If we redo this operation, the same objects should all
-    # be None afterwards:
-    ens.load_smry(cache_eclsum=None)
-    assert not any([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    ens.get_smry()
-    assert all([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    ens.get_smry(cache_eclsum=False)
-    assert not any([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    ens.get_smry_stats()
-    assert all([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    ens.get_smry_stats(cache_eclsum=False)
-    assert not any([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    ens.get_smry_dates()
-    assert all([x._eclsum for (idx, x) in ens.realizations.items()])
-
-    # Clear the cached objects because the statement above has cached it..
-    for _, realization in ens.realizations.items():
-        realization._eclsum = None
-
-    ens.get_smry_dates(cache_eclsum=False)
-    assert not any([x._eclsum for (idx, x) in ens.realizations.items()])
-
-
 def test_filedescriptors():
     """Test how filedescriptors are used.