From 5d5cd66e5fd58c3c13f2e0ea2e585ff3c4d4db93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Berland?= Date: Tue, 9 Feb 2021 10:29:55 +0100 Subject: [PATCH] Prune cache_eclsum from fmu-ensemble --- src/fmu/ensemble/ensemble.py | 23 ++------------- src/fmu/ensemble/ensembleset.py | 19 ++----------- src/fmu/ensemble/realization.py | 49 +++++--------------------------- tests/test_ensemble.py | 50 --------------------------------- 4 files changed, 11 insertions(+), 130 deletions(-) diff --git a/src/fmu/ensemble/ensemble.py b/src/fmu/ensemble/ensemble.py index b2604d43..f0010b65 100644 --- a/src/fmu/ensemble/ensemble.py +++ b/src/fmu/ensemble/ensemble.py @@ -697,7 +697,6 @@ def load_smry( time_index="raw", column_keys=None, stacked=True, - cache_eclsum=True, start_date=None, end_date=None, include_restart=True, @@ -742,9 +741,6 @@ def load_smry( by vector name, and with realization index as columns. This only works when time_index is the same for all realizations. Not implemented yet! - cache_eclsum (boolean): Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date (str or date): First date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -772,7 +768,6 @@ def load_smry( realization.load_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, include_restart=include_restart, @@ -963,7 +958,6 @@ def get_smry_dates( normalize=True, start_date=None, end_date=None, - cache_eclsum=True, include_restart=True, ): """Return list of datetimes for an ensemble according to frequency @@ -999,13 +993,9 @@ def get_smry_dates( # Build list of list of eclsum dates eclsumsdates = [] for _, realization in self.realizations.items(): - if realization.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ): + if realization.get_eclsum(include_restart=include_restart): eclsumsdates.append( - realization.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ).dates + realization.get_eclsum(include_restart=include_restart).dates ) return unionize_smry_dates(eclsumsdates, freq, normalize, start_date, end_date) @@ -1014,7 +1004,6 @@ def get_smry_stats( column_keys=None, time_index="monthly", quantiles=None, - cache_eclsum=True, start_date=None, end_date=None, ): @@ -1037,8 +1026,6 @@ def get_smry_stats( to compute. Quantiles refer to scientific standard, which is opposite to the oil industry convention. Ask for p10 if you need the oil industry p90. - cache_eclsum: boolean for whether to keep the loaded EclSum - object in memory after data has been loaded. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1070,7 +1057,6 @@ def get_smry_stats( dframe = self.get_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) @@ -1323,7 +1309,6 @@ def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=True, start_date=None, end_date=None, include_restart=True, @@ -1342,9 +1327,6 @@ def get_smry( a wanted frequencey for dates, daily, weekly, monthly, yearly, that will be send to get_smry_dates() column_keys: list of column key wildcards - cache_eclsum: boolean for whether to cache the EclSum - objects. Defaults to True. Set to False if - not enough memory to keep all summary files in memory. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1379,7 +1361,6 @@ def get_smry( dframe = realization.get_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, include_restart=include_restart, ) dframe.insert(0, "REAL", index) diff --git a/src/fmu/ensemble/ensembleset.py b/src/fmu/ensemble/ensembleset.py index 83a0af33..44ec2b1b 100644 --- a/src/fmu/ensemble/ensembleset.py +++ b/src/fmu/ensemble/ensembleset.py @@ -571,7 +571,6 @@ def load_smry( self, time_index="raw", column_keys=None, - cache_eclsum=True, start_date=None, end_date=None, ): @@ -595,9 +594,6 @@ def load_smry( If a string is supplied, that string is attempted used via get_smry_dates() in order to obtain a time index. column_keys: list of column key wildcards - cache_eclsum: Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -616,7 +612,6 @@ def load_smry( ensemble.load_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) @@ -630,7 +625,6 @@ def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=False, start_date=None, end_date=None, ): @@ -646,11 +640,6 @@ def get_smry( If a string is supplied, that string is attempted used via get_smry_dates() in order to obtain a time index. column_keys: list of column key wildcards - cache_eclsum: boolean for whether to cache the EclSum - objects. Defaults to False. Set to True if - there is enough memory to keep all realizations summary - files in memory at once. This will speed up subsequent - operations start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -667,7 +656,7 @@ def get_smry( smrylist = [] for _, ensemble in self._ensembles.items(): smry = ensemble.get_smry( - time_index, column_keys, cache_eclsum, start_date, end_date + time_index, column_keys, start_date, end_date ) smry.insert(0, "ENSEMBLE", ensemble.name) smrylist.append(smry) @@ -676,7 +665,7 @@ def get_smry( return pd.DataFrame() def get_smry_dates( - self, freq="monthly", cache_eclsum=True, start_date=None, end_date=None + self, freq="monthly", start_date=None, end_date=None ): """Return list of datetimes from an ensembleset @@ -689,9 +678,6 @@ def get_smry_dates( yield the sorted union of all valid timesteps for all realizations. Other valid options are 'daily', 'monthly' and 'yearly'. - cache_eclsum: Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -709,7 +695,6 @@ def get_smry_dates( rawdates = rawdates.union( ensemble.get_smry_dates( freq="report", - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) diff --git a/src/fmu/ensemble/realization.py b/src/fmu/ensemble/realization.py index d748df06..c3bf761c 100644 --- a/src/fmu/ensemble/realization.py +++ b/src/fmu/ensemble/realization.py @@ -100,9 +100,6 @@ def __init__( ) self.eclfiles = None # ecl2df.EclFiles object - self._eclsum = None # Placeholder for caching - self._eclsum_include_restart = None # Flag for cached object - # The datastore for internalized data. Dictionary # indexed by filenames (local to the realization). # values in the dictionary can be either dicts or dataframes @@ -894,7 +891,7 @@ def get_eclfiles(self): return None return ecl2df.EclFiles(data_filename) - def get_eclsum(self, cache=True, include_restart=True): + def get_eclsum(self, include_restart=True): """ Fetch the Eclipse Summary file from the realization and return as a libecl EclSum object @@ -908,9 +905,6 @@ def get_eclsum(self, cache=True, include_restart=True): turning off autodiscovery is strongly recommended. Arguments: - cache: boolean indicating whether we should keep an - object reference to the EclSum object. Set to - false if you need to conserve memory. include_restart: boolean sent to libecl for whether restart files should be traversed. @@ -918,10 +912,6 @@ def get_eclsum(self, cache=True, include_restart=True): EclSum: object representing the summary file. None if nothing was found. """ - if cache and self._eclsum: # Return cached object if available - if self._eclsum_include_restart == include_restart: - return self._eclsum - unsmry_file_row = self.files[self.files.FILETYPE == "UNSMRY"] unsmry_filename = None if len(unsmry_file_row) == 1: @@ -952,11 +942,6 @@ def get_eclsum(self, cache=True, include_restart=True): # or if SMSPEC is missing. logger.warning("Failed to create summary instance from %s", unsmry_filename) return None - - if cache: - self._eclsum = eclsum - self._eclsum_include_restart = include_restart - return eclsum def load_smry(self, **kwargs): @@ -994,7 +979,6 @@ def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=True, start_date=None, end_date=None, include_restart=True, @@ -1013,8 +997,6 @@ def get_smry( to these. If a date in ISO-8601 format is supplied, that is used as a single date. column_keys: list of column key wildcards. None means everything. - cache_eclsum: boolean for whether to keep the loaded EclSum - object in memory after data has been loaded. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1032,7 +1014,7 @@ def get_smry( if self.get_eclfiles() is None: return pd.DataFrame() try: - dframe = ecl2df.summary.df( + return ecl2df.summary.df( self.get_eclfiles(), time_index=time_index, column_keys=column_keys, @@ -1043,17 +1025,6 @@ def get_smry( paramfile=None, datetime=datetimeindex, ) - if cache_eclsum: - if self.get_eclfiles(): - # This is necessary for tests to pass, but might not - # be the way to do it since ecl2df should take full - # responsibility for the eclsum objects. - self._eclsum = self.get_eclfiles().get_eclsum() - else: - # Do this to ensure that we cut the rope to the EclSum object - # Can be critical for garbage collection - self._eclsum = None - return dframe except OSError: # Missing or bogus UNSMRY file return pd.DataFrame() @@ -1121,7 +1092,7 @@ def _glob_smry_keys(self, column_keys): keys = set() for key in column_keys: if isinstance(key, str): - keys = keys.union(set(self._eclsum.keys(key))) + keys = keys.union(set(self.get_eclsum().keys(key))) return list(keys) def get_volumetric_rates(self, column_keys=None, time_index=None, time_unit=None): @@ -1142,24 +1113,18 @@ def get_smryvalues(self, props_wildcard=None): a dataframe with values. Raw times from UNSMRY. Empty dataframe if no summary file data available """ - if not self._eclsum: # check if it is cached - self.get_eclsum() - - if not self._eclsum: - return pd.DataFrame() - props = self._glob_smry_keys(props_wildcard) - if "numpy_vector" in dir(self._eclsum): + if "numpy_vector" in dir(self.get_eclsum()): data = { - prop: self._eclsum.numpy_vector(prop, report_only=False) + prop: self.get_eclsum().numpy_vector(prop, report_only=False) for prop in props } else: # get_values() is deprecated in newer libecl data = { - prop: self._eclsum.get_values(prop, report_only=False) for prop in props + prop: self.get_eclsum().get_values(prop, report_only=False) for prop in props } - dates = self._eclsum.get_dates(report_only=False) + dates = self.get_eclsum().get_dates(report_only=False) return pd.DataFrame(data=data, index=dates) def get_smry_dates( diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 75552dd2..1bb7d4db 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -828,56 +828,6 @@ def test_nonexisting(): assert not nopermission -def test_eclsumcaching(): - """Test caching of eclsum""" - - if "__file__" in globals(): - # Easen up copying test code into interactive sessions - testdir = os.path.dirname(os.path.abspath(__file__)) - else: - testdir = os.path.abspath(".") - - dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0" - ens = ScratchEnsemble("reektest", dirs) - - # The problem here is if you load in a lot of UNSMRY files - # and the Python process keeps them in memory. Not sure - # how to check in code that an object has been garbage collected - # but for garbage collection to work, at least the realization - # _eclsum variable must be None. - - ens.load_smry() - # Default is to do caching, so these will not be None: - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - # If we redo this operation, the same objects should all - # be None afterwards: - ens.load_smry(cache_eclsum=None) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_stats() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_stats(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_dates() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - # Clear the cached objects because the statement above has cached it.. - for _, realization in ens.realizations.items(): - realization._eclsum = None - - ens.get_smry_dates(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - def test_filedescriptors(): """Test how filedescriptors are used.