First stab at using ecl2df for summary file extraction
Works, but there is a lot of cleanup to be done in realization.py
and ensemble.py to fully remove the eclsum objects from fmu-ensemble.
berland committed Feb 1, 2021
1 parent 82cf5fa commit b27978a
Showing 5 changed files with 93 additions and 167 deletions.
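
Note on the direction of this change: summary extraction moves from ecl.summary.EclSum.pandas_frame() to ecl2df. A minimal sketch of the new extraction path, assuming a hypothetical DATA file path; the ecl2df calls mirror those visible in the diff below:

    import ecl2df

    # Hypothetical Eclipse run; any simulation with a DATA file will do
    eclfiles = ecl2df.EclFiles("realization-0/iter-0/eclipse/model/REEK-0.DATA")

    # Yearly-resampled summary data, as the new get_smry() requests it
    smry = ecl2df.summary.df(eclfiles, time_index="yearly", column_keys=["FOPT"])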
3 changes: 0 additions & 3 deletions setup.cfg
@@ -9,9 +9,6 @@ exclude = docs,
 [aliases]
 test = pytest
 
-[tool:pytest]
-addopts = --verbose -x
-
 [build_sphinx]
 all-files = 1
 warning-is-error = 1
1 change: 1 addition & 0 deletions setup.py
@@ -22,6 +22,7 @@
 
 REQUIREMENTS = [
     "ecl>=2.9",
+    "ecl2df",
     "numpy",
     "pandas",
     "pyyaml>=5.1",
240 changes: 91 additions & 149 deletions src/fmu/ensemble/realization.py
@@ -25,20 +25,14 @@
 from ecl.grid import EclGrid
 from ecl import EclFileFlagEnum
 
+import ecl2df
+
 from .virtualrealization import VirtualRealization
 from .realizationcombination import RealizationCombination
 from .util import parse_number, flatten, shortcut2path
 from .util.rates import compute_volumetric_rates
 from .util.dates import unionize_smry_dates
 
-HAVE_ECL2DF = False
-try:
-    import ecl2df
-
-    HAVE_ECL2DF = True
-except ImportError:
-    HAVE_ECL2DF = False
-
 logger = logging.getLogger(__name__)
 
 
@@ -104,6 +98,8 @@ def __init__(
         self.files = pd.DataFrame(
             columns=["FULLPATH", "FILETYPE", "LOCALPATH", "BASENAME"]
         )
+        self.eclfiles = None  # ecl2df.EclFiles object
+
         self._eclsum = None  # Placeholder for caching
         self._eclsum_include_restart = None  # Flag for cached object
 
@@ -851,33 +847,51 @@ def get_eclfiles(self):
         Returns:
             ecl2df.EclFiles. None if nothing found
         """
-        if not HAVE_ECL2DF:
-            logger.warning("ecl2df not installed. Skipping")
-            return None
-        data_file_row = self.files[self.files["FILETYPE"] == "DATA"]
+        data_file_rows = self.files[self.files["FILETYPE"] == "DATA"]
         data_filename = None
-        if len(data_file_row) == 1:
-            data_filename = data_file_row["FULLPATH"].values[0]
+        unsmry_file_rows = self.files[self.files["FILETYPE"] == "UNSMRY"]
+        unsmry_filename = None
+        if len(data_file_rows) == 1:
+            data_filename = data_file_rows["FULLPATH"].values[0]
+        elif len(unsmry_file_rows) == 1:
+            unsmry_filename = unsmry_file_rows["FULLPATH"].values[0]
+            # We construct the DATA file, even though it might not exist:
+            data_filename = unsmry_filename.replace(".UNSMRY", ".DATA")
         elif self._autodiscovery:
             data_fileguess = os.path.join(self._origpath, "eclipse/model", "*.DATA")
             data_filenamelist = glob.glob(data_fileguess)
             if not data_filenamelist:
-                return None  # No filename matches *DATA
+                return None  # No filename matches *DATA or *UNSMRY
             if len(data_filenamelist) > 1:
                 logger.warning(
                     (
                         "Multiple DATA files found, "
                         "consider turning off auto-discovery"
                     )
                 )
-            data_filename = data_filenamelist[0]
-            self.find_files(data_filename)
+            if data_filenamelist:
+                data_filename = data_filenamelist[0]
+                self.find_files(data_filename)
+
+            unsmry_fileguess = os.path.join(self._origpath, "eclipse/model", "*.UNSMRY")
+            unsmry_filenamelist = glob.glob(unsmry_fileguess)
+            if not unsmry_filenamelist:
+                return None  # No filename matches
+            if len(unsmry_filenamelist) > 1:
+                logger.warning(
+                    "Multiple UNSMRY files found, consider turning off auto-discovery"
+                )
+            unsmry_filename = unsmry_filenamelist[0]
+            self.find_files(unsmry_filename)
+
         else:
             # There is no DATA file to be found.
-            logger.warning("No DATA file found!")
+            logger.warning("No DATA and/or UNSMRY file found!")
             return None
         if not os.path.exists(data_filename):
-            return None
+            if unsmry_filename is not None:
+                return ecl2df.EclFiles(unsmry_filename.replace(".UNSMRY", ".DATA"))
+            else:
+                return None
         return ecl2df.EclFiles(data_filename)
 
     def get_eclsum(self, cache=True, include_restart=True):
@@ -945,102 +959,35 @@ def get_eclsum(self, cache=True, include_restart=True):
 
         return eclsum
 
-    def load_smry(
-        self,
-        time_index="raw",
-        column_keys=None,
-        cache_eclsum=True,
-        start_date=None,
-        end_date=None,
-        include_restart=True,
-    ):
-        """Produce dataframe from Summary data from the realization
-
-        When this function is called, the dataframe will be
-        internalized. Internalization of summary data in a
-        realization object supports different time_index, but there is
-        no handling of multiple sets of column_keys. The cached data
-        will be called
-
-        'share/results/tables/unsmry--<time_index>.csv'
-
-        where <time_index> is among 'yearly', 'monthly', 'daily', 'first',
-        'last' or 'raw' (meaning the raw dates in the SMRY file), depending
-        on the chosen time_index. If a custom time_index (list
-        of datetime) was supplied, <time_index> will be called 'custom'.
-
-        Wraps ecl.summary.EclSum.pandas_frame()
-
-        See also get_smry()
-
-        Args:
-            time_index: string indicating a resampling frequency,
-                'yearly', 'monthly', 'daily', 'first', 'last' or 'raw', the
-                latter will return the simulated report steps (also default).
-                If a list of DateTime is supplied, data will be resampled
-                to these.
-            column_keys: list of column key wildcards. None means everything.
-            cache_eclsum: boolean for whether to keep the loaded EclSum
-                object in memory after data has been loaded.
-            start_date: str or date with first date to include.
-                Dates prior to this date will be dropped, supplied
-                start_date will always be included. Overridden if time_index
-                is 'first' or 'last'.
-            end_date: str or date with last date to be included.
-                Dates past this date will be dropped, supplied
-                end_date will always be included. Overridden if time_index
-                is 'first' or 'last'.
-            include_restart: boolean sent to libecl for whether restart
-                files should be traversed.
+    def load_smry(self, **kwargs):
+        """Wrap around get_smry(), but also cache the result
 
         Returns:
-            DataFrame with summary keys as columns and dates as indices.
-            Empty dataframe if no summary is available or column
-            keys do not exist.
+            DataFrame: with summary keys as columns and dates as indices.
+            Empty dataframe if no summary is available.
         """
-        if not self.get_eclsum(cache=cache_eclsum):
-            # Return empty, but do not store the empty dataframe in self.data
-            return pd.DataFrame()
-        time_index_path = time_index
-        if time_index == "raw":
-            time_index_arg = None
-        elif isinstance(time_index, str):
-            # Note: This call will recache the smry object.
-            time_index_arg = self.get_smry_dates(
-                freq=time_index,
-                start_date=start_date,
-                end_date=end_date,
-                include_restart=include_restart,
-            )
-        elif isinstance(time_index, (list, np.ndarray)):
-            time_index_arg = time_index
-            time_index_path = "custom"
-        elif time_index is None:
-            time_index_path = "raw"
-            time_index_arg = time_index
-        else:
-            raise TypeError("'time_index' has to be a string, a list or None")
-
-        if not isinstance(column_keys, list):
-            column_keys = [column_keys]
-
-        # Do the actual work:
-        dframe = self.get_eclsum(
-            cache=cache_eclsum, include_restart=include_restart
-        ).pandas_frame(time_index_arg, column_keys)
-        dframe = dframe.reset_index()
-        dframe.rename(columns={"index": "DATE"}, inplace=True)
-
-        # Cache the result:
-        localpath = "share/results/tables/unsmry--" + time_index_path + ".csv"
-        self.data[localpath] = dframe
-
-        # Do this to ensure that we cut the rope to the EclSum object
-        # Can be critical for garbage collection
-        if not cache_eclsum:
-            self._eclsum = None
+        # This change of indexing is peculiar for load_smry() vs get_smry().
+        # It might change in fmu-ensemble 2.0 to always return a datetime64
+        # index.
+        dframe = self.get_smry(**kwargs).reset_index()
+
+        cachename = None
+        # Cache the result for supported time indices:
+        if "time_index" not in kwargs or kwargs["time_index"] is None:
+            cachename = "raw"
+        elif isinstance(kwargs["time_index"], list):
+            cachename = "custom"
+        elif str(kwargs["time_index"]) in [
+            "raw",
+            "first",
+            "last",
+            "report",
+            "daily",
+            "weekly",
+            "monthly",
+            "yearly",
+        ]:
+            cachename = kwargs["time_index"]
+
+        if cachename:
+            localpath = "share/results/tables/unsmry--" + cachename + ".csv"
+            self.data[localpath] = dframe
         return dframe
 
     def get_smry(
@@ -1051,8 +998,9 @@ def get_smry(
         start_date=None,
         end_date=None,
         include_restart=True,
+        datetimeindex=False,
     ):
-        """Wrapper for EclSum.pandas_frame
+        """Wrapper for ecl2df.summary
 
         This gives access to the underlying data on disk without
         touching internalized dataframes.
@@ -1075,43 +1023,37 @@
             Dates past this date will be dropped, supplied
             end_date will always be included. Overridden if time_index
             is 'first' or 'last'.
+            include_restart (bool): Whether to traverse restart files.
+            datetimeindex (bool): Set to True if a datetime64 index is wanted.
+
         Returns empty dataframe if there is no summary file, or if the
         column_keys are not existing.
         """
-        if not isinstance(column_keys, list):
-            column_keys = [column_keys]
-        if isinstance(time_index, str) and time_index == "raw":
-            time_index_arg = None
-        elif isinstance(time_index, str):
-            try:
-                parseddate = dateutil.parser.isoparse(time_index)
-                time_index_arg = [parseddate]
-            except ValueError:
-                time_index_arg = self.get_smry_dates(
-                    freq=time_index,
-                    start_date=start_date,
-                    end_date=end_date,
-                    include_restart=include_restart,
-                )
-        elif time_index is None or isinstance(time_index, (list, np.ndarray)):
-            time_index_arg = time_index
-        else:
-            raise TypeError("'time_index' has to be a string, a list or None")
-        if self.get_eclsum(cache=cache_eclsum, include_restart=include_restart):
-            try:
-                dataframe = self.get_eclsum(
-                    cache=cache_eclsum, include_restart=include_restart
-                ).pandas_frame(time_index_arg, column_keys)
-            except ValueError:
-                # We get here if we have requested non-existing column keys
-                return pd.DataFrame()
-            if not cache_eclsum:
-                # Ensure EclSum object can be garbage collected
-                self._eclsum = None
-            return dataframe
-        return pd.DataFrame()
+        try:
+            dframe = ecl2df.summary.df(
+                self.get_eclfiles(),
+                time_index=time_index,
+                column_keys=column_keys,
+                start_date=start_date,
+                end_date=end_date,
+                include_restart=include_restart,
+                params=False,
+                paramfile=None,
+                datetime=datetimeindex,
+            )
+            if cache_eclsum:
+                if self.get_eclfiles():
+                    # This is necessary for tests to pass, but might not
+                    # be the way to do it since ecl2df should take full
+                    # responsibility for the eclsum objects.
+                    self._eclsum = self.get_eclfiles().get_eclsum()
+            else:
+                # Do this to ensure that we cut the rope to the EclSum object
+                # Can be critical for garbage collection
+                self._eclsum = None
+            return dframe
+        except FileNotFoundError:
+            return pd.DataFrame()
 
     def get_smry_meta(self, column_keys=None):
         """
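
A hedged usage sketch of the rewritten pair: get_smry() now reads via ecl2df without internalizing anything, while load_smry() wraps it and caches the dataframe under a name derived from time_index, as the code above shows. The runpath below is hypothetical:

    from fmu.ensemble import ScratchRealization

    # Hypothetical realization directory with an Eclipse run under eclipse/model
    real = ScratchRealization("realization-0/iter-0")

    # Reads summary data from disk; nothing is internalized
    monthly = real.get_smry(column_keys=["FOPT"], time_index="monthly")

    # Same data, but also cached in the realization object
    real.load_smry(column_keys=["FOPT"], time_index="monthly")
    assert "share/results/tables/unsmry--monthly.csv" in real.data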
15 changes: 1 addition & 14 deletions tests/test_ecl2df.py
@@ -3,25 +3,16 @@
 import os
 import logging
 
-import pytest
+import ecl2df
 
 from fmu.ensemble import ScratchEnsemble, ScratchRealization
 
-HAVE_ECL2DF = True
-try:
-    import ecl2df
-except ImportError:
-    HAVE_ECL2DF = False
-
 logger = logging.getLogger(__name__)
 
 
 def test_ecl2df_real():
     """Check that we can utilize ecl2df on single realizations"""
 
-    if not HAVE_ECL2DF:
-        pytest.skip()
-
     if "__file__" in globals():
         # Ease copying test code into interactive sessions
         testdir = os.path.dirname(os.path.abspath(__file__))
@@ -49,8 +40,6 @@ def test_reek():
     reekens = ScratchEnsemble(
         "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
     )
-    if not HAVE_ECL2DF:
-        pytest.skip()
 
     def extract_compdat(kwargs):
         """Callback function to extract compdat data using ecl2df
@@ -90,8 +79,6 @@ def get_smry(kwargs):
     reekens = ScratchEnsemble(
         "reektest", testdir + "/data/testensemble-reek001/" + "realization-*/iter-0"
    )
-    if not HAVE_ECL2DF:
-        pytest.skip()
 
     callback_smry = reekens.apply(get_smry, column_keys="FOPT", time_index="yearly")
     direct_smry = reekens.get_smry(column_keys="FOPT", time_index="yearly")
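
The tests above exercise ecl2df through the ensemble's apply() callback mechanism. A condensed sketch of the get_smry callback pattern, assuming (as the visible callbacks suggest) that apply() passes the realization object along in the kwargs dict; the ensemble path matches the tests:

    import ecl2df
    from fmu.ensemble import ScratchEnsemble

    ens = ScratchEnsemble(
        "reektest", "tests/data/testensemble-reek001/realization-*/iter-0"
    )

    def get_smry(kwargs):
        # apply() forwards keyword arguments, plus the realization object
        eclfiles = kwargs["realization"].get_eclfiles()
        return ecl2df.summary.df(
            eclfiles, column_keys=kwargs["column_keys"], time_index=kwargs["time_index"]
        )

    # Should agree with ens.get_smry(column_keys="FOPT", time_index="yearly")
    callback_smry = ens.apply(get_smry, column_keys="FOPT", time_index="yearly")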
1 change: 0 additions & 1 deletion tests/test_ensemble.py
@@ -152,7 +152,6 @@ def test_reek001(tmpdir):
         ]
     )
     assert len(reekensemble) == 5
-    print(reekensemble.files)
     assert len(reekensemble.files) == 24
 
     # File discovery must be repeated for the newly added realizations
