From fc4395fa73a430628589268f7574b79af066b8c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5vard=20Berland?= Date: Thu, 19 Nov 2020 08:06:12 +0100 Subject: [PATCH] API change: Refactor to use ecl2df for summary file extraction * realization.get_smry() has changed to always return a dummy index * cache_eclsum is pruned from fmu-ensemble. --- setup.cfg | 3 - setup.py | 1 + src/fmu/ensemble/ensemble.py | 85 +------ src/fmu/ensemble/ensembleset.py | 52 +---- src/fmu/ensemble/observations.py | 6 +- src/fmu/ensemble/realization.py | 295 ++++++++----------------- src/fmu/ensemble/util/rates.py | 4 +- src/fmu/ensemble/virtualensemble.py | 5 - src/fmu/ensemble/virtualrealization.py | 12 +- tests/test_ecl2df.py | 15 +- tests/test_ensemble.py | 56 +---- tests/test_observations.py | 102 +++++++-- tests/test_realization.py | 47 ++-- tests/test_virtualrealization.py | 6 +- 14 files changed, 227 insertions(+), 462 deletions(-) diff --git a/setup.cfg b/setup.cfg index 598af0d7..86b20dee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,9 +9,6 @@ exclude = docs, [aliases] test = pytest -[tool:pytest] -addopts = --verbose -x - [build_sphinx] all-files = 1 warning-is-error = 1 diff --git a/setup.py b/setup.py index 0e69fd7b..fc245422 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ REQUIREMENTS = [ "ecl>=2.9", + "ecl2df", "numpy", "pandas", "pyyaml>=5.1", diff --git a/src/fmu/ensemble/ensemble.py b/src/fmu/ensemble/ensemble.py index fc778604..fd19a891 100644 --- a/src/fmu/ensemble/ensemble.py +++ b/src/fmu/ensemble/ensemble.py @@ -689,16 +689,13 @@ def get_df(self, localpath, merge=None): # the realization index, and end up in a MultiIndex dframe = pd.concat(dflist, sort=False).reset_index() dframe.rename(columns={"level_0": "REAL"}, inplace=True) - del dframe["level_1"] # This is the indices from each real - return dframe + return dframe.drop("level_1", axis="columns", errors="ignore") raise KeyError("No data found for " + localpath) def load_smry( self, time_index="raw", column_keys=None, - stacked=None, - cache_eclsum=None, start_date=None, end_date=None, include_restart=True, @@ -743,9 +740,6 @@ def load_smry( by vector name, and with realization index as columns. This only works when time_index is the same for all realizations. Not implemented yet! - cache_eclsum (boolean): Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date (str or date): First date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -761,28 +755,6 @@ def load_smry( pd.DataFame: Summary vectors for the ensemble, or a dict of dataframes if stacked=False. """ - if stacked is not None: - warnings.warn( - ( - "stacked option to load_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - else: - stacked = True - if not stacked: - raise NotImplementedError - - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to load_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - # Future: Multithread this! for realidx, realization in self.realizations.items(): # We do not store the returned DataFrames here, @@ -793,7 +765,6 @@ def load_smry( realization.load_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, include_restart=include_restart, @@ -984,7 +955,6 @@ def get_smry_dates( normalize=True, start_date=None, end_date=None, - cache_eclsum=None, include_restart=True, ): """Return list of datetimes for an ensemble according to frequency @@ -1016,28 +986,12 @@ def get_smry_dates( Returns: list of datetimes. Empty list if no data found. """ - - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry_dates() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - else: - cache_eclsum = True - # Build list of list of eclsum dates eclsumsdates = [] for _, realization in self.realizations.items(): - if realization.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ): + if realization.get_eclsum(include_restart=include_restart): eclsumsdates.append( - realization.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ).dates + realization.get_eclsum(include_restart=include_restart).dates ) return unionize_smry_dates(eclsumsdates, freq, normalize, start_date, end_date) @@ -1046,7 +1000,6 @@ def get_smry_stats( column_keys=None, time_index="monthly", quantiles=None, - cache_eclsum=None, start_date=None, end_date=None, ): @@ -1069,8 +1022,6 @@ def get_smry_stats( to compute. Quantiles refer to scientific standard, which is opposite to the oil industry convention. Ask for p10 if you need the oil industry p90. - cache_eclsum: boolean for whether to keep the loaded EclSum - object in memory after data has been loaded. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1088,15 +1039,6 @@ def get_smry_stats( strings in the outer index are changed accordingly. If no data is found, return empty DataFrame. """ - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry_stats() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - if quantiles is None: quantiles = [10, 90] @@ -1111,7 +1053,6 @@ def get_smry_stats( dframe = self.get_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) @@ -1377,7 +1318,6 @@ def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=None, start_date=None, end_date=None, include_restart=True, @@ -1388,6 +1328,9 @@ def get_smry( Wraps around Realization.get_smry() which wraps around ecl.summary.EclSum.pandas_frame() + The returned dataframe will always have a dummy index, and + DATE and REAL as columns. + Args: time_index: list of DateTime if interpolation is wanted default is None, which returns the raw Eclipse report times @@ -1396,9 +1339,6 @@ def get_smry( a wanted frequencey for dates, daily, weekly, monthly, yearly, that will be send to get_smry_dates() column_keys: list of column key wildcards - cache_eclsum: boolean for whether to cache the EclSum - objects. Defaults to True. Set to False if - not enough memory to keep all summary files in memory. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1415,15 +1355,6 @@ def get_smry( REAL with integers is added to distinguish realizations. If no realizations, empty DataFrame is returned. """ - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - if isinstance(time_index, str): # Try interpreting as ISO-date: try: @@ -1442,14 +1373,12 @@ def get_smry( dframe = realization.get_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, include_restart=include_restart, ) dframe.insert(0, "REAL", index) - dframe.index.name = "DATE" dflist.append(dframe) if dflist: - return pd.concat(dflist, sort=False).reset_index() + return pd.concat(dflist, sort=False) return pd.DataFrame() def get_eclgrid(self, props, report=0, agg="mean", active_only=False): diff --git a/src/fmu/ensemble/ensembleset.py b/src/fmu/ensemble/ensembleset.py index e825b0d4..911c3eb6 100644 --- a/src/fmu/ensemble/ensembleset.py +++ b/src/fmu/ensemble/ensembleset.py @@ -572,7 +572,6 @@ def load_smry( self, time_index="raw", column_keys=None, - cache_eclsum=None, start_date=None, end_date=None, ): @@ -596,9 +595,6 @@ def load_smry( If a string is supplied, that string is attempted used via get_smry_dates() in order to obtain a time index. column_keys: list of column key wildcards - cache_eclsum: Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -612,21 +608,11 @@ def load_smry( A DataFame of summary vectors for the ensembleset. The column 'ENSEMBLE' will denote each ensemble's name """ - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to load_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - # Future: Multithread this: for _, ensemble in self._ensembles.items(): ensemble.load_smry( time_index=time_index, column_keys=column_keys, - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) @@ -640,7 +626,6 @@ def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=None, start_date=None, end_date=None, ): @@ -656,11 +641,6 @@ def get_smry( If a string is supplied, that string is attempted used via get_smry_dates() in order to obtain a time index. column_keys: list of column key wildcards - cache_eclsum: boolean for whether to cache the EclSum - objects. Defaults to False. Set to True if - there is enough memory to keep all realizations summary - files in memory at once. This will speed up subsequent - operations start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -674,30 +654,16 @@ def get_smry( ENSEMBLE will distinguish the different ensembles by their respective names. """ - - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - smrylist = [] for _, ensemble in self._ensembles.items(): - smry = ensemble.get_smry( - time_index, column_keys, cache_eclsum, start_date, end_date - ) + smry = ensemble.get_smry(time_index, column_keys, start_date, end_date) smry.insert(0, "ENSEMBLE", ensemble.name) smrylist.append(smry) if smrylist: return pd.concat(smrylist, sort=False) return pd.DataFrame() - def get_smry_dates( - self, freq="monthly", cache_eclsum=None, start_date=None, end_date=None - ): + def get_smry_dates(self, freq="monthly", start_date=None, end_date=None): """Return list of datetimes from an ensembleset Datetimes from each realization in each ensemble can @@ -709,9 +675,6 @@ def get_smry_dates( yield the sorted union of all valid timesteps for all realizations. Other valid options are 'daily', 'monthly' and 'yearly'. - cache_eclsum: Boolean for whether we should cache the EclSum - objects. Set to False if you cannot keep all EclSum files in - memory simultaneously start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -723,22 +686,11 @@ def get_smry_dates( Returns: list of datetime.date. """ - - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry_dates() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - rawdates = set() for _, ensemble in self._ensembles.items(): rawdates = rawdates.union( ensemble.get_smry_dates( freq="report", - cache_eclsum=cache_eclsum, start_date=start_date, end_date=end_date, ) diff --git a/src/fmu/ensemble/observations.py b/src/fmu/ensemble/observations.py index 0492310a..af453c49 100644 --- a/src/fmu/ensemble/observations.py +++ b/src/fmu/ensemble/observations.py @@ -1,6 +1,4 @@ -""" -Observations support and related calculations -""" +"""Observations support and related calculations""" import os import math @@ -174,7 +172,7 @@ def load_smry(self, realization, smryvector, time_index="yearly", smryerror=None """ dataseries = realization.get_smry( column_keys=[smryvector], time_index=time_index - )[smryvector] + )[["DATE", smryvector]].set_index("DATE")[smryvector] # In the context of this function, datetimes are not supported. Ensure dates: if isinstance(dataseries.index, pd.DatetimeIndex): diff --git a/src/fmu/ensemble/realization.py b/src/fmu/ensemble/realization.py index 561494da..4aa2fa99 100644 --- a/src/fmu/ensemble/realization.py +++ b/src/fmu/ensemble/realization.py @@ -26,20 +26,14 @@ from ecl.grid import EclGrid from ecl import EclFileFlagEnum +import ecl2df + from .virtualrealization import VirtualRealization from .realizationcombination import RealizationCombination from .util import parse_number, flatten, shortcut2path from .util.rates import compute_volumetric_rates from .util.dates import unionize_smry_dates -HAVE_ECL2DF = False -try: - import ecl2df - - HAVE_ECL2DF = True -except ImportError: - HAVE_ECL2DF = False - logger = logging.getLogger(__name__) @@ -105,8 +99,7 @@ def __init__( self.files = pd.DataFrame( columns=["FULLPATH", "FILETYPE", "LOCALPATH", "BASENAME"] ) - self._eclsum = None # Placeholder for caching - self._eclsum_include_restart = None # Flag for cached object + self.eclfiles = None # ecl2df.EclFiles object # The datastore for internalized data. Dictionary # indexed by filenames (local to the realization). @@ -852,18 +845,21 @@ def get_eclfiles(self): Returns: ecl2df.EclFiles. None if nothing found """ - if not HAVE_ECL2DF: - logger.warning("ecl2df not installed. Skipping") - return None - data_file_row = self.files[self.files["FILETYPE"] == "DATA"] + data_file_rows = self.files[self.files["FILETYPE"] == "DATA"] data_filename = None - if len(data_file_row) == 1: - data_filename = data_file_row["FULLPATH"].values[0] + unsmry_file_rows = self.files[self.files["FILETYPE"] == "UNSMRY"] + unsmry_filename = None + if len(data_file_rows) == 1: + data_filename = data_file_rows["FULLPATH"].values[0] + elif len(unsmry_file_rows) == 1: + unsmry_filename = unsmry_file_rows["FULLPATH"].values[0] + # We construct the DATA file, even though it might not exist: + data_filename = unsmry_filename.replace(".UNSMRY", ".DATA") elif self._autodiscovery: data_fileguess = os.path.join(self._origpath, "eclipse/model", "*.DATA") data_filenamelist = glob.glob(data_fileguess) if not data_filenamelist: - return None # No filename matches *DATA + return None # No filename matches *DATA or *UNSMRY if len(data_filenamelist) > 1: logger.warning( ( @@ -871,17 +867,32 @@ def get_eclfiles(self): "consider turning off auto-discovery" ) ) - data_filename = data_filenamelist[0] - self.find_files(data_filename) + if data_filenamelist: + data_filename = data_filenamelist[0] + self.find_files(data_filename) + + unsmry_fileguess = os.path.join(self._origpath, "eclipse/model", "*.UNSMRY") + unsmry_filenamelist = glob.glob(unsmry_fileguess) + if not unsmry_filenamelist: + return None # No filename matches + if len(unsmry_filenamelist) > 1: + logger.warning( + "Multiple UNSMRY files found, consider turning off auto-discovery" + ) + unsmry_filename = unsmry_filenamelist[0] + self.find_files(unsmry_filename) + else: - # There is no DATA file to be found. - logger.warning("No DATA file found!") + logger.warning("No DATA and/or UNSMRY file found!") return None if not os.path.exists(data_filename): - return None + if unsmry_filename is not None: + return ecl2df.EclFiles(unsmry_filename.replace(".UNSMRY", ".DATA")) + else: + return None return ecl2df.EclFiles(data_filename) - def get_eclsum(self, cache=True, include_restart=True): + def get_eclsum(self, include_restart=True): """ Fetch the Eclipse Summary file from the realization and return as a libecl EclSum object @@ -895,9 +906,6 @@ def get_eclsum(self, cache=True, include_restart=True): turning off autodiscovery is strongly recommended. Arguments: - cache: boolean indicating whether we should keep an - object reference to the EclSum object. Set to - false if you need to conserve memory. include_restart: boolean sent to libecl for whether restart files should be traversed. @@ -905,10 +913,6 @@ def get_eclsum(self, cache=True, include_restart=True): EclSum: object representing the summary file. None if nothing was found. """ - if cache and self._eclsum: # Return cached object if available - if self._eclsum_include_restart == include_restart: - return self._eclsum - unsmry_file_row = self.files[self.files.FILETYPE == "UNSMRY"] unsmry_filename = None if len(unsmry_file_row) == 1: @@ -939,136 +943,52 @@ def get_eclsum(self, cache=True, include_restart=True): # or if SMSPEC is missing. logger.warning("Failed to create summary instance from %s", unsmry_filename) return None - - if cache: - self._eclsum = eclsum - self._eclsum_include_restart = include_restart - return eclsum - def load_smry( - self, - time_index="raw", - column_keys=None, - cache_eclsum=None, - start_date=None, - end_date=None, - include_restart=True, - ): - """Produce dataframe from Summary data from the realization - - When this function is called, the dataframe will be - internalized. Internalization of summary data in a - realization object supports different time_index, but there is - no handling of multiple sets of column_keys. The cached data - will be called - - 'share/results/tables/unsmry--.csv' - - where is among 'yearly', 'monthly', 'daily', 'first', - 'last' or 'raw' (meaning the raw dates in the SMRY file), depending - on the chosen time_index. If a custom time_index (list - of datetime) was supplied, will be called 'custom'. - - Wraps ecl.summary.EclSum.pandas_frame() - - See also get_smry() - - Args: - time_index: string indicating a resampling frequency, - 'yearly', 'monthly', 'daily', 'first', 'last' or 'raw', the - latter will return the simulated report steps (also default). - If a list of DateTime is supplied, data will be resampled - to these. - column_keys: list of column key wildcards. None means everything. - cache_eclsum: boolean for whether to keep the loaded EclSum - object in memory after data has been loaded. - start_date: str or date with first date to include. - Dates prior to this date will be dropped, supplied - start_date will always be included. Overridden if time_index - is 'first' or 'last'. - end_date: str or date with last date to be included. - Dates past this date will be dropped, supplied - end_date will always be included. Overridden if time_index - is 'first' or 'last'. - include_restart: boolean sent to libecl for whether restart - files should be traversed. - - Returns: - DataFrame with summary keys as columns and dates as indices. - Empty dataframe if no summary is available or column - keys do not exist. - DataFrame: with summary keys as columns and dates as indices. - Empty dataframe if no summary is available. - """ - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to load_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - else: - cache_eclsum = True - - if not self.get_eclsum(cache=cache_eclsum): - # Return empty, but do not store the empty dataframe in self.data - return pd.DataFrame() - time_index_path = time_index - if time_index == "raw": - time_index_arg = None - elif isinstance(time_index, str): - # Note: This call will recache the smry object. - time_index_arg = self.get_smry_dates( - freq=time_index, - start_date=start_date, - end_date=end_date, - include_restart=include_restart, - ) - elif isinstance(time_index, (list, np.ndarray)): - time_index_arg = time_index - time_index_path = "custom" - elif time_index is None: - time_index_path = "raw" - time_index_arg = time_index - else: - raise TypeError("'time_index' has to be a string, a list or None") - - if not isinstance(column_keys, list): - column_keys = [column_keys] - - # Do the actual work: - dframe = self.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ).pandas_frame(time_index_arg, column_keys) - dframe = dframe.reset_index() - dframe.rename(columns={"index": "DATE"}, inplace=True) - - # Cache the result: - localpath = "share/results/tables/unsmry--" + time_index_path + ".csv" - self.data[localpath] = dframe - - # Do this to ensure that we cut the rope to the EclSum object - # Can be critical for garbage collection - if not cache_eclsum: - self._eclsum = None + def load_smry(self, **kwargs): + """Wrap around get_smry(), but also cache the result""" + dframe = self.get_smry(**kwargs) + + cachename = None + # Cache the result for supported time indices: + if "time_index" not in kwargs or kwargs["time_index"] is None: + cachename = "raw" + elif isinstance(kwargs["time_index"], list): + cachename = "custom" + elif str(kwargs["time_index"]) in [ + "raw", + "first", + "last", + "report", + "daily", + "weekly", + "monthly", + "yearly", + ]: + cachename = kwargs["time_index"] + + if cachename: + localpath = "share/results/tables/unsmry--" + cachename + ".csv" + self.data[localpath] = dframe return dframe def get_smry( self, time_index=None, column_keys=None, - cache_eclsum=None, start_date=None, end_date=None, include_restart=True, ): - """Wrapper for EclSum.pandas_frame + """Wrapper for ecl2df.summary This gives access to the underlying data on disk without touching internalized dataframes. + The returned dataframe will have a dummy index, and the dates in + the column DATE. The DATE column will contain either datetime.datetime + or pandas.Timestamp objects. + Arguments: time_index: string indicating a resampling frequency, 'yearly', 'monthly', 'daily', 'first', 'last' or 'raw', the @@ -1077,8 +997,6 @@ def get_smry( to these. If a date in ISO-8601 format is supplied, that is used as a single date. column_keys: list of column key wildcards. None means everything. - cache_eclsum: boolean for whether to keep the loaded EclSum - object in memory after data has been loaded. start_date: str or date with first date to include. Dates prior to this date will be dropped, supplied start_date will always be included. Overridden if time_index @@ -1087,55 +1005,32 @@ def get_smry( Dates past this date will be dropped, supplied end_date will always be included. Overridden if time_index is 'first' or 'last'. + include_restart (bool): Whether to traverse restart files. Returns empty dataframe if there is no summary file, or if the column_keys are not existing. """ - - if cache_eclsum is not None: - warnings.warn( - ( - "cache_eclsum option to get_smry() is deprecated and " - "will be removed in fmu-ensemble v2.0.0" - ), - FutureWarning, - ) - else: - cache_eclsum = True - - if not isinstance(column_keys, list): - column_keys = [column_keys] - if isinstance(time_index, str) and time_index == "raw": - time_index_arg = None - elif isinstance(time_index, str): - try: - parseddate = dateutil.parser.isoparse(time_index) - time_index_arg = [parseddate] - except ValueError: - - time_index_arg = self.get_smry_dates( - freq=time_index, - start_date=start_date, - end_date=end_date, - include_restart=include_restart, - ) - elif time_index is None or isinstance(time_index, (list, np.ndarray)): - time_index_arg = time_index - else: - raise TypeError("'time_index' has to be a string, a list or None") - if self.get_eclsum(cache=cache_eclsum, include_restart=include_restart): - try: - dataframe = self.get_eclsum( - cache=cache_eclsum, include_restart=include_restart - ).pandas_frame(time_index_arg, column_keys) - except ValueError: - # We get here if we have requested non-existing column keys - return pd.DataFrame() - if not cache_eclsum: - # Ensure EclSum object can be garbage collected - self._eclsum = None - return dataframe - return pd.DataFrame() + if self.get_eclfiles() is None: + return pd.DataFrame() + try: + return ecl2df.summary.df( + self.get_eclfiles(), + time_index=time_index, + column_keys=column_keys, + start_date=start_date, + end_date=end_date, + include_restart=include_restart, + params=False, + paramfile=None, + ).reset_index() + except OSError: + # Missing or bogus UNSMRY file + return pd.DataFrame() + except ValueError: + # From libecl when requested columns keys are not found, + # or from pd.tseries.frequencies.to_offset() if frequency + # specifier is not known. + return pd.DataFrame() def get_smry_meta(self, column_keys=None): """ @@ -1195,7 +1090,7 @@ def _glob_smry_keys(self, column_keys): keys = set() for key in column_keys: if isinstance(key, str): - keys = keys.union(set(self._eclsum.keys(key))) + keys = keys.union(set(self.get_eclsum().keys(key))) return list(keys) def get_volumetric_rates(self, column_keys=None, time_index=None, time_unit=None): @@ -1224,25 +1119,19 @@ def get_smryvalues(self, props_wildcard=None): ), FutureWarning, ) - - if not self._eclsum: # check if it is cached - self.get_eclsum() - - if not self._eclsum: - return pd.DataFrame() - props = self._glob_smry_keys(props_wildcard) - if "numpy_vector" in dir(self._eclsum): + if "numpy_vector" in dir(self.get_eclsum()): data = { - prop: self._eclsum.numpy_vector(prop, report_only=False) + prop: self.get_eclsum().numpy_vector(prop, report_only=False) for prop in props } else: # get_values() is deprecated in newer libecl data = { - prop: self._eclsum.get_values(prop, report_only=False) for prop in props + prop: self.get_eclsum().get_values(prop, report_only=False) + for prop in props } - dates = self._eclsum.get_dates(report_only=False) + dates = self.get_eclsum().get_dates(report_only=False) return pd.DataFrame(data=data, index=dates) def get_smry_dates( diff --git a/src/fmu/ensemble/util/rates.py b/src/fmu/ensemble/util/rates.py index d192b552..faed7742 100644 --- a/src/fmu/ensemble/util/rates.py +++ b/src/fmu/ensemble/util/rates.py @@ -65,7 +65,9 @@ def compute_volumetric_rates(realization, column_keys, time_index, time_unit): return pd.DataFrame() cum_df = realization.get_smry(column_keys=column_keys, time_index=time_index) - # get_smry() for realizations return a dataframe indexed by 'DATE' + + if not cum_df.empty: + cum_df.set_index("DATE", inplace=True) # Compute row-wise difference, shift back one row # to get the NaN to the end, and then drop the NaN. diff --git a/src/fmu/ensemble/virtualensemble.py b/src/fmu/ensemble/virtualensemble.py index 35ba07a7..80957324 100644 --- a/src/fmu/ensemble/virtualensemble.py +++ b/src/fmu/ensemble/virtualensemble.py @@ -872,11 +872,6 @@ def get_smry(self, column_keys=None, time_index="monthly"): # Now ask the VirtualRealization to do interpolation interp = vreal.get_smry(column_keys=column_keys, time_index=time_index) - # Assume we get back a dataframe indexed by the dates from vreal - # We must reset that index, and ensure the index column - # gets a correct name - interp.index = interp.index.set_names(["DATE"]) - interp = interp.reset_index() interp["REAL"] = realidx smry_interpolated.append(interp) return pd.concat(smry_interpolated, ignore_index=True, sort=False) diff --git a/src/fmu/ensemble/virtualrealization.py b/src/fmu/ensemble/virtualrealization.py index 113d346b..7f97d252 100644 --- a/src/fmu/ensemble/virtualrealization.py +++ b/src/fmu/ensemble/virtualrealization.py @@ -287,6 +287,10 @@ def get_smry(self, column_keys=None, time_index="monthly"): Returns data for those columns that are known, unknown columns will be issued a warning for. + The returned dataframe will have a dummy index, and the dates in + the column DATE. The DATE column will contain either datetime.datetime + or pandas.Timestamp objects. + BUG: If some columns are available only in certain dataframes, we might miss them (e.g. we ask for yearly FOPT, and we have yearly smry with only WOPT data, and FOPT is only in daily @@ -359,9 +363,10 @@ def get_smry(self, column_keys=None, time_index="monthly"): ) smry = self.get_df("unsmry--" + chosen_smry)[["DATE"] + column_keys] + # index is dummy, the date is in the DATE column + smry.set_index("DATE", inplace=True) # Add the extra datetimes to interpolate at. - smry.set_index("DATE", inplace=True) smry.index = pd.to_datetime(smry.index) smry = smry.append( pd.DataFrame(index=pd.to_datetime(time_index_dt)), sort=False @@ -390,8 +395,9 @@ def get_smry(self, column_keys=None, time_index="monthly"): smry[noncum_columns].fillna(method="bfill").fillna(value=0) ) - smry.index = smry.index.set_names(["DATE"]) - return smry.loc[pd.to_datetime(time_index_dt)] + smry = smry.loc[pd.to_datetime(time_index_dt)] + smry.index.name = "DATE" + return smry.reset_index() def get_smry_dates(self, freq="monthly", normalize=False): """Return list of datetimes available in the realization diff --git a/tests/test_ecl2df.py b/tests/test_ecl2df.py index 3edcbede..673f3803 100644 --- a/tests/test_ecl2df.py +++ b/tests/test_ecl2df.py @@ -3,25 +3,16 @@ import os import logging -import pytest +import ecl2df from fmu.ensemble import ScratchEnsemble, ScratchRealization -HAVE_ECL2DF = True -try: - import ecl2df -except ImportError: - HAVE_ECL2DF = False - logger = logging.getLogger(__name__) def test_ecl2df_real(): """Check that we can utilize ecl2df on single realizations""" - if not HAVE_ECL2DF: - pytest.skip() - if "__file__" in globals(): # Easen up copying test code into interactive sessions testdir = os.path.dirname(os.path.abspath(__file__)) @@ -49,8 +40,6 @@ def test_reek(): reekens = ScratchEnsemble( "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0" ) - if not HAVE_ECL2DF: - pytest.skip() def extract_compdat(kwargs): """Callback fnction to extract compdata data using ecl2df @@ -90,8 +79,6 @@ def get_smry(kwargs): reekens = ScratchEnsemble( "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0" ) - if not HAVE_ECL2DF: - pytest.skip() callback_smry = reekens.apply(get_smry, column_keys="FOPT", time_index="yearly") direct_smry = reekens.get_smry(column_keys="FOPT", time_index="yearly") diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py index 3eb1930a..9ee2827c 100644 --- a/tests/test_ensemble.py +++ b/tests/test_ensemble.py @@ -71,6 +71,7 @@ def test_reek001(tmpdir): paramsdf = reekensemble.parameters # also test as property paramsdf = reekensemble.get_df("parameters.txt") assert len(paramsdf) == 5 + print(paramsdf.head()) assert len(paramsdf.columns) == 26 # 25 parameters, + REAL column paramsdf.to_csv("params.csv", index=False) @@ -152,7 +153,6 @@ def test_reek001(tmpdir): ] ) assert len(reekensemble) == 5 - print(reekensemble.files) assert len(reekensemble.files) == 24 # File discovery must be repeated for the newly added realizations @@ -499,9 +499,6 @@ def test_ensemble_ecl(): assert not reekensemble.get_wellnames("") assert len(reekensemble.get_wellnames(["OP*", "WI*"])) == 8 - # eclipse well groups list - assert len(reekensemble.get_groupnames()) == 3 - # delta between two ensembles diff = reekensemble - reekensemble assert len(diff.get_smry(column_keys=["FOPR", "FGPR", "FWCT"]).columns) == 5 @@ -829,57 +826,6 @@ def test_nonexisting(): assert not nopermission -def test_eclsumcaching(): - """Test caching of eclsum""" - - if "__file__" in globals(): - # Easen up copying test code into interactive sessions - testdir = os.path.dirname(os.path.abspath(__file__)) - else: - testdir = os.path.abspath(".") - - dirs = testdir + "/data/testensemble-reek001/" + "realization-*/iter-0" - ens = ScratchEnsemble("reektest", dirs) - - # The problem here is if you load in a lot of UNSMRY files - # and the Python process keeps them in memory. Not sure - # how to check in code that an object has been garbage collected - # but for garbage collection to work, at least the realization - # _eclsum variable must be None. - - ens.load_smry() - # Default is to do caching, so these will not be None: - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - # If we redo this operation, the same objects should all - # be None afterwards: - ens.load_smry(cache_eclsum=False) - # cache_eclsum==None is from v1.1.5 no longer equivalent to False - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_stats() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_stats(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - ens.get_smry_dates() - assert all([x._eclsum for (idx, x) in ens.realizations.items()]) - - # Clear the cached objects because the statement above has cached it.. - for _, realization in ens.realizations.items(): - realization._eclsum = None - - ens.get_smry_dates(cache_eclsum=False) - assert not any([x._eclsum for (idx, x) in ens.realizations.items()]) - - def test_filedescriptors(): """Test how filedescriptors are used. diff --git a/tests/test_observations.py b/tests/test_observations.py index 8c64fab0..520fc9f9 100644 --- a/tests/test_observations.py +++ b/tests/test_observations.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """Testing observations in fmu-ensemble.""" import os @@ -66,24 +65,26 @@ def test_real_mismatch(): ) realmis = obs.mismatch(real) - # Check layout of returned data - assert isinstance(realmis, pd.DataFrame) - assert len(realmis) == 1 + pd.testing.assert_frame_equal( + realmis, + pd.DataFrame( + [ + { + "OBSTYPE": "txt", + "OBSKEY": "parameters.txt/FWL", + "MISMATCH": -2.0, + "L1": 2.0, + "L2": 4.0, + "SIMVALUE": 1700, + "OBSVALUE": 1702, + "MEASERROR": 1, + "SIGN": -1, + } + ] + ), + ) assert "REAL" not in realmis.columns # should only be there for ensembles. - assert "OBSTYPE" in realmis.columns - assert "OBSKEY" in realmis.columns assert "DATE" not in realmis.columns # date is not relevant - assert "MISMATCH" in realmis.columns - assert "L1" in realmis.columns - assert "L2" in realmis.columns - - # Check actually computed values, there should only be one row with data: - assert realmis.loc[0, "OBSTYPE"] == "txt" - assert realmis.loc[0, "OBSKEY"] == "parameters.txt/FWL" - assert realmis.loc[0, "MISMATCH"] == -2 - assert realmis.loc[0, "SIGN"] == -1 - assert realmis.loc[0, "L1"] == 2 - assert realmis.loc[0, "L2"] == 4 # Another observation set: obs2 = Observations( @@ -96,12 +97,46 @@ def test_real_mismatch(): } ) realmis2 = obs2.mismatch(real) - assert len(realmis2) == 3 - assert "parameters.txt/RMS_SEED" in realmis2["OBSKEY"].values - assert "outputs.txt/top_structure" in realmis2["OBSKEY"].values - assert "npv.txt" in realmis2["OBSKEY"].values - - # assert much more! + pd.testing.assert_frame_equal( + realmis2, + pd.DataFrame( + [ + { + "OBSTYPE": "txt", + "OBSKEY": "parameters.txt/RMS_SEED", + "MISMATCH": -177148215.0, + "L1": 177148215.0, + "L2": 3.1381490077686224e16, + "SIMVALUE": 422851785, + "OBSVALUE": 600000000, + "MEASERROR": 1, + "SIGN": -1, + }, + { + "OBSTYPE": "txt", + "OBSKEY": "outputs.txt/top_structure", + "MISMATCH": 24.0, + "L1": 24.0, + "L2": 576.0, + "SIMVALUE": 3224, + "OBSVALUE": 3200, + "MEASERROR": 1, + "SIGN": 1, + }, + { + "OBSTYPE": "scalar", + "OBSKEY": "npv.txt", + "MISMATCH": 44.0, + "L1": 44.0, + "L2": 1936.0, + "SIMVALUE": 3444, + "OBSVALUE": 3400, + "MEASERROR": 1, + "SIGN": 1, + }, + ] + ), + ) # Test that we can write the observations to yaml # and verify that the exported yaml can be reimported @@ -215,6 +250,26 @@ def test_smry(): # loaded realization. mismatch = obs.mismatch(real) + # Assert the first row exactly: + pd.testing.assert_frame_equal( + mismatch.head(1), + pd.DataFrame( + [ + { + "OBSTYPE": "smry", + "OBSKEY": "WBP4:OP_1", + "DATE": datetime.date(2001, 1, 1), + "MEASERROR": 4.0, + "MISMATCH": -2.159454345703125, + "OBSVALUE": 251.0, + "SIMVALUE": 248.84054565429688, + "L1": 2.159454345703125, + "L2": 4.663243071176112, + "SIGN": -1, + } + ] + ), + ) assert len(mismatch) == 21 # later: implement counting in the obs object assert mismatch.L1.sum() > 0 assert mismatch.L2.sum() > 0 @@ -537,7 +592,6 @@ def test_ensset_mismatch(): == mismatch[mismatch.ENSEMBLE == "iter-1"].L1.sum() ) - # This is quite hard to input in dict-format. Better via YAML.. obs_pr = Observations( { "smry": [ diff --git a/tests/test_realization.py b/tests/test_realization.py index 70e1e216..1d03fab5 100644 --- a/tests/test_realization.py +++ b/tests/test_realization.py @@ -320,9 +320,7 @@ def test_volumetric_rates(): assert real.get_volumetric_rates(column_keys="FOOBAR").empty assert real.get_volumetric_rates(column_keys=["FOOBAR"]).empty assert real.get_volumetric_rates(column_keys={}).empty - - with pytest.raises(ValueError): - real.get_volumetric_rates(column_keys="FOPT", time_index="bogus") + assert real.get_volumetric_rates(column_keys="FOPT", time_index="bogus").empty mcum = real.get_smry(column_keys="FOPT", time_index="monthly") dmcum = real.get_volumetric_rates(column_keys="FOPT", time_index="monthly") @@ -330,9 +328,9 @@ def test_volumetric_rates(): # Pick 10 **random** dates to get the volumetric rates between: daily_dates = real.get_smry_dates(freq="daily", normalize=False) - subset_dates = np.random.choice(daily_dates, size=10, replace=False) + subset_dates = list(np.random.choice(daily_dates, size=10, replace=False)) subset_dates.sort() - dcum = real.get_smry(column_keys="FOPT", time_index=subset_dates) + dcum = real.get_smry(column_keys="FOPT", time_index=subset_dates).set_index("DATE") ddcum = real.get_volumetric_rates(column_keys="FOPT", time_index=subset_dates) assert ddcum["FOPR"].iloc[-1] == 0 @@ -428,21 +426,29 @@ def test_datenormalization(): realdir = os.path.join(testdir, "data/testensemble-reek001", "realization-0/iter-0") real = ensemble.ScratchRealization(realdir) raw = real.get_smry(column_keys="FOPT", time_index="raw") - assert str(raw.index[-1]) == "2003-01-02 00:00:00" + assert str(raw["DATE"].values[-1]) == "2003-01-02T00:00:00.000000000" daily = real.get_smry(column_keys="FOPT", time_index="daily") - assert str(daily.index[-1]) == "2003-01-02" + assert str(daily["DATE"].values[-1]) == "2003-01-02" monthly = real.get_smry(column_keys="FOPT", time_index="monthly") - assert str(monthly.index[-1]) == "2003-02-01" + assert str(monthly["DATE"].values[-1]) == "2003-02-01" yearly = real.get_smry(column_keys="FOPT", time_index="yearly") - assert str(yearly.index[-1]) == "2004-01-01" + assert str(yearly["DATE"].values[-1]) == "2004-01-01" weekly = real.get_smry(column_keys="FOPT", time_index="weekly") - assert str(weekly.index[-1]) == "2003-01-06" # First Monday after 2003-01-02 + assert ( + str(weekly["DATE"].values[-1]) == "2003-01-06" + ) # First Monday after 2003-01-02 weekly = real.get_smry(column_keys="FOPT", time_index="W-MON") - assert str(weekly.index[-1]) == "2003-01-06" # First Monday after 2003-01-02 + assert ( + str(weekly["DATE"].values[-1]) == "2003-01-06" + ) # First Monday after 2003-01-02 weekly = real.get_smry(column_keys="FOPT", time_index="W-TUE") - assert str(weekly.index[-1]) == "2003-01-07" # First Tuesday after 2003-01-02 + assert ( + str(weekly["DATE"].values[-1]) == "2003-01-07" + ) # First Tuesday after 2003-01-02 weekly = real.get_smry(column_keys="FOPT", time_index="W-THU") - assert str(weekly.index[-1]) == "2003-01-02" # First Thursday after 2003-01-02 + assert ( + str(weekly["DATE"].values[-1]) == "2003-01-02" + ) # First Thursday after 2003-01-02 # Check that time_index=None and time_index="raw" behaves like default raw = real.load_smry(column_keys="FOPT", time_index="raw") @@ -456,15 +462,18 @@ def test_datenormalization(): # Check that we get the same correct normalization # with load_smry() real.load_smry(column_keys="FOPT", time_index="raw") - assert str(real.get_df("unsmry--raw")["DATE"].iloc[-1]) == "2003-01-02 00:00:00" + assert ( + str(real.get_df("unsmry--raw")["DATE"].values[-1]) + == "2003-01-02T00:00:00.000000000" + ) real.load_smry(column_keys="FOPT", time_index="daily") - assert str(real.get_df("unsmry--daily")["DATE"].iloc[-1]) == "2003-01-02" + assert str(real.get_df("unsmry--daily")["DATE"].values[-1]) == "2003-01-02" real.load_smry(column_keys="FOPT", time_index="monthly") - assert str(real.get_df("unsmry--monthly")["DATE"].iloc[-1]) == "2003-02-01" + assert str(real.get_df("unsmry--monthly")["DATE"].values[-1]) == "2003-02-01" real.load_smry(column_keys="FOPT", time_index="yearly") - assert str(real.get_df("unsmry--yearly")["DATE"].iloc[-1]) == "2004-01-01" + assert str(real.get_df("unsmry--yearly")["DATE"].values[-1]) == "2004-01-01" real.load_smry(column_keys="FOPT", time_index="weekly") - assert str(real.get_df("unsmry--weekly")["DATE"].iloc[-1]) == "2003-01-06" + assert str(real.get_df("unsmry--weekly")["DATE"].values[-1]) == "2003-01-06" def test_singlereal_ecl(tmp="TMP"): @@ -514,7 +523,7 @@ def test_singlereal_ecl(tmp="TMP"): # Try ISO-date for time_index: singledata = real.get_smry(time_index="2000-05-05", column_keys="FOPT") assert "FOPT" in singledata - assert "2000-05-05" in singledata.index + assert str(singledata["DATE"].values[0]).startswith("2000-05-05") # start and end should be included: assert ( diff --git a/tests/test_virtualrealization.py b/tests/test_virtualrealization.py index f98c2d9b..bed01c12 100644 --- a/tests/test_virtualrealization.py +++ b/tests/test_virtualrealization.py @@ -155,10 +155,10 @@ def test_get_smry(): assert all(vfopt == fopt) # But note that the dtype of the index in each dataframe differs # vfopt.index.dtype == datetime, while fopt.index.dtype == object - assert len(fopt.columns) == 1 # DATE is index (unlabeled) + assert len(fopt.columns) == 2 # DATE is the first column dvfopt = vreal.get_smry(column_keys="FOPT", time_index="daily") - assert all(dvfopt.diff() >= 0) + assert all(dvfopt["FOPT"].diff().dropna() >= 0) # Linear interpolation should give many unique values: assert len(dvfopt["FOPT"].unique()) == 1462 # Length is here 1462 while daily smry for the scratchrealization @@ -256,7 +256,7 @@ def test_get_smry2(): alldefaults = vreal.get_smry() assert len(alldefaults) == monthly_length - assert len(alldefaults.columns) == 49 + assert len(alldefaults.columns) == 50 def test_get_smry_cumulative():