WIP: Add support for reading OpenPMD files

Co-authored-by: Krishiv Bhatia <[email protected]> Co-authored-by: Gabriele Bozzola <[email protected]>"
Sbozzolo · May 26, 2024 · 11e75ef · 11e75ef
1 parent 970c574
commit 11e75ef
Show file tree

Hide file tree

Showing 5 changed files with 575 additions and 617 deletions.
diff --git a/docs/features.rst b/docs/features.rst
@@ -119,7 +119,7 @@ Units
 Grid Data
 ---------
 
-- Read 1D, 2D, and 3D ASCII and HDF5 files as ``HierarchicalGridData``, which supports:
+- Read 1D, 2D, and 3D ASCII, HDF5 and OpenPMD files as ``HierarchicalGridData``, which supports:
 
   - working with multiple components and refinement levels;
   - handling ghost-zones;

diff --git a/docs/index.rst b/docs/index.rst
@@ -51,7 +51,7 @@ For a full list of available features, see the :doc:`features page <features>`.
   (:py:mod:`~.tensor`).
 - Work with 1D, 2D, and 3D grid functions (:py:mod:`~.grid_data`,
   :py:mod:`~.cactus_grid_functions`) as output by ``CarpetIOHDF5`` or
-  ``CarpetIOASCII``.
+  ``CarpetIOASCII``, or in OpenPMD format.
 - Work with horizon data from (:py:mod:`~.cactus_horizons`) as output by
   ``QuasiLocalMeasures`` and ``AHFinderDirect``.
 - Perform common operations with horizons (:py:mod:`~.hor_utils`).

diff --git a/kuibit/cactus_grid_functions.py b/kuibit/cactus_grid_functions.py
@@ -1004,7 +1004,7 @@ def _parse_file(self, path):
         # file. We keep a collection of all these in the variable self.alldata
         try:
             with h5py.File(path, "r") as f:
-                for group in f.keys():
+                for group in f:
                     matched = self.rx_group_name.match(group)
                     # If this is not an interesting group, just skip it
                     if not matched:
@@ -1246,7 +1246,7 @@ def clear_cache(self):
         for filename, file_reader in self.alldata.items():
             for iteration, iteration_reader in file_reader.items():
                 for ref_level, ref_level_reader in iteration_reader.items():
-                    for component in ref_level_reader.keys():
+                    for component in ref_level_reader:
                         self.alldata[filename][iteration][ref_level][
                             component
                         ] = None
@@ -1301,37 +1301,39 @@ class OneGridFunctionOpenPMD(BaseOneGridFunction):
 
     This class is derived from :py:class:`~.BaseOneGridFunction` and implements
     the reading facilities.
-
     """
 
     # This class implements the details on how to read the data for OpenPMD
     # files, most of the functionalities of the class are in
     # OneGridFunctionBase.
 
-    # Let's unpack the regex
-    #
-    # 1. ^ $ match the beginning and end of string
-    # 2. (\w+) match a word (the variable name)
-    # 3. _ is the literal underscore
-    # 4. (?:rl|lev) is a non-capturing group that matches the literals
-    #    rl or lev
-    # 5. (\d+) matches a number, the refinement level
-    _pattern_group_name = r"^(\w+)_(?:rl|lev)(\d+)$"
-
-    def __init__(self, allfiles, var_name: str, dimension):
+    def __init__(self, allfiles, var_name: str, dimension, mesh_basename: str):
         """Constructor.
 
         :param allfiles: Paths of files associated to the variable.
         :type allfiles: list of str
         :param var_name: Variable name.
         :type var_name: str
+        :param mesh_basename: Basename of the OpenPMD mesh. Typically, <thornname>_<groupname>
+        :type mesh_basename: str
 
         """
         self.dimension = dimension
+        self.mesh_basename = mesh_basename
         self.map = None
 
         self._iterations_to_times = {}
 
+        # Let's unpack the regex
+        #
+        # 1. ^ $ match the beginning and end of string
+        # 2. {mesh_basename} is substituted with the prefix
+        # 3. _ is the literal underscore
+        # 4. (?:rl|lev) is a non-capturing group that matches the literals
+        #    rl or lev
+        # 5. (\d+) matches a number, the refinement level
+        self._pattern_mesh_name = rf"^{mesh_basename}_(?:rl|lev)(\d+)$"
+
         super().__init__(allfiles, var_name)
 
         # super() will fill the other variables that we need for dataset_format
@@ -1341,19 +1343,16 @@ def __init__(self, allfiles, var_name: str, dimension):
         # OpenPMD data does not contain ghost zones
         self.are_ghostzones_in_files = False
 
-    def _parse_file(self, path):
+    def _parse_file(self, path: str):
         """Read the content of the given file (without reading the data).
 
+        For OpenPMD files, a "file" is really a directory.
+
         :param path: Path of the file.
         :type path: str
 
         """
-        rx_mesh = re.compile(self._pattern_group_name)
-
-        # TODO: (Performance)
-        #
-        # Directly find the name of the relevant mesh, instead of looping over
-        # everything.
+        rx_mesh = re.compile(self._pattern_mesh_name)
 
         with openpmd_series(path) as series:
             iter_open_pmd = series.iterations
@@ -1362,35 +1361,27 @@ def _parse_file(self, path):
                 all_meshes = iteration_obj.meshes
                 for mesh_name, mesh_obj in all_meshes.items():
                     matched = rx_mesh.match(mesh_name)
-                    if matched is None:
-                        raise RuntimeError(f"Could not parse mesh {mesh_name}")
-                    ref_level = int(matched.group(2))
-
-                    # Here is where we prepare are nested alldata dictionary
-                    alldata_file = self.alldata.setdefault(path, {})
-                    alldata_iteration = alldata_file.setdefault(
-                        int(iteration), {}
-                    )
-                    alldata_ref_level = alldata_iteration.setdefault(
-                        ref_level, {}
-                    )
-
-                    # Loop through all variables in specific mesh and index all
-                    # the variables and chunks (ie, components)
-                    for variable in mesh_obj.items():
-                        var_name = variable[0]
-                        self.mesh_name = mesh_name
-                        chunks = mesh_obj[var_name].available_chunks()
-                        chunk_no = 0
-                        for _chunk in chunks:
-                            component = chunk_no
+                    if matched is not None:
+                        ref_level = matched.group(1)
+                        # Here is where we prepare are nested alldata dictionary
+                        alldata_file = self.alldata.setdefault(path, {})
+                        alldata_iteration = alldata_file.setdefault(
+                            int(iteration), {}
+                        )
+                        alldata_ref_level = alldata_iteration.setdefault(
+                            int(ref_level), {}
+                        )
+                        component = 0
+                        for _chunk in mesh_obj[
+                            self.var_name
+                        ].available_chunks():
                             # We set the actual data to None, and we will read it in
                             # _read_component_as_uniform_grid_data upon request
                             alldata_ref_level.setdefault(component, None)
-                            chunk_no += 1
+                            component += 1
 
     def _read_component_as_uniform_grid_data(
-        self, path, iteration, ref_level, component
+        self, path: str, iteration: int, ref_level: int, component: int
     ):
         """Return the component at the given iteration, refinement level, and component.
 
@@ -1407,51 +1398,36 @@ def _read_component_as_uniform_grid_data(
         :rtype: :py:class:`~.UniformGridData`
 
         """
-        rx_mesh = re.compile(self._pattern_group_name)
+        # ref_level is an integer like 0, 1, 2, 3, 4, 5.
+        # So it is formatted to 2 digits to give 00, 01, 04, 10, 11
+        mesh_name = f"{self.mesh_basename}_lev{ref_level:02d}"
 
         if self.alldata[path][iteration][ref_level][component] is None:
             with openpmd_series(path) as series:
                 time = series.iterations[iteration].time
-                mesh = series.iterations[iteration].meshes
-                for mesh_name, mesh_obj in mesh.items():
-                    matched = rx_mesh.match(mesh_name)
-                    if matched is None:
-                        raise RuntimeError(f"Could not parse mesh {mesh_name}")
-                    ref_level_matched = int(matched.group(2))
-
-                    if ref_level == ref_level_matched:
-                        origin = np.array(mesh_obj.grid_global_offset)
-                        dx = np.array(mesh_obj.grid_spacing)
-                        for variable, variable_obj in mesh_obj.items():
-                            if variable == self.var_name:
-                                chunk = variable_obj.available_chunks()[
-                                    component
-                                ]
-
-                                offset = np.array(chunk.offset)
-                                shape = chunk.extent
-
-                                # Do the actual reading
-                                data = variable_obj.load_chunk(
-                                    chunk.offset, chunk.extent
-                                )
-                                series.flush()
-
-                                grid = grid_data.UniformGrid(
-                                    shape,
-                                    x0=(origin + offset * dx),
-                                    dx=dx,
-                                    ref_level=ref_level,
-                                    num_ghost=[0, 0, 0],
-                                    time=time,
-                                    iteration=iteration,
-                                    component=component,
-                                )
-
-                                self.alldata[path][iteration][ref_level][
-                                    component
-                                ] = grid_data.UniformGridData(grid, data)
-
+                mesh_obj = series.iterations[iteration].meshes[mesh_name]
+                origin = np.array(mesh_obj.grid_global_offset)
+                dx = np.array(mesh_obj.grid_spacing)
+                mrc = mesh_obj[self.var_name]
+                chunk = mrc.available_chunks()[component]
+                offset = np.array(chunk.offset)
+                shape = chunk.extent
+                # Do the actual reading
+                data = mrc.load_chunk(chunk.offset, chunk.extent)
+                series.flush()
+                grid = grid_data.UniformGrid(
+                    shape,
+                    x0=(origin + offset * dx),
+                    dx=dx,
+                    ref_level=ref_level,
+                    num_ghost=[0, 0, 0],
+                    time=time,
+                    iteration=iteration,
+                    component=component,
+                )
+                self.alldata[path][iteration][ref_level][component] = (
+                    grid_data.UniformGridData(grid, data)
+                )
         return self.alldata[path][iteration][ref_level][component]
 
     def clear_cache(self):
@@ -1466,12 +1442,12 @@ def clear_cache(self):
         for filename, file_reader in self.alldata.items():
             for iteration, iteration_reader in file_reader.items():
                 for ref_level, ref_level_reader in iteration_reader.items():
-                    for component in ref_level_reader.keys():
+                    for component in ref_level_reader:
                         self.alldata[filename][iteration][ref_level][
                             component
                         ] = None
 
-    def time_at_iteration(self, iteration):
+    def time_at_iteration(self, iteration: int):
         """Return the time corresponding to the provided iteration.
 
         :param iteration: Iteration.
@@ -1533,6 +1509,21 @@ class AllGridFunctions:
         (0, 1, 2): "xyz",
     }
 
+    # regex to match mesh_name like z4c_allc_lev05, admbasex_curv_lev00,
+    # admbasex_lapse_lev05
+    # Let's uinpack this regex:
+    # 1. ^ and $ mean that we are matching the entire string
+    # 2. (\w+) is a capturing group that matches a word. This is thorn_groupname
+    #    For example, wavetoy_state
+    # 3. _ matches the literal underscore
+    # 4. (?: ) is a non-capturing group, rl|lev means that we match one of the two
+    # 5. (\d+) matches numbers
+    _mesh_name_pattern = r"^(\w+)_(?:rl|lev)(\d+)$"
+
+    # Mapping between a variable name and the prefix for the mesh name in the
+    # OpenPMD file. Typically, thorname_groupname.
+    _openpmd_mesh_basenames = {}
+
     # The oddball case is with OpenPMD files. OpenPMD files are actually folders
     # with extension .bp4 and they are always 3D.
 
@@ -1613,6 +1604,7 @@ def __init__(self, allfiles, dimension, num_ghost=None):
         rx_h5 = re.compile(h5_pattern)
         rx_ascii = re.compile(ascii_pattern)
         rx_openpmd = re.compile(openpmd_pattern)
+        rx_mesh = re.compile(self._mesh_name_pattern)
 
         # Here we scan all the files and find those with a name that match
         # one of our regular expressions.
@@ -1659,7 +1651,7 @@ def __init__(self, allfiles, dimension, num_ghost=None):
                     try:
                         with h5py.File(f, "r") as h5f:
                             # Here group is in the sense of HDF5 group
-                            for group in h5f.keys():
+                            for group in h5f:
                                 group_matched = rx_group_name.match(group)
                                 # If this is not an interesting group, just skip it
                                 if not group_matched:
@@ -1726,7 +1718,7 @@ def __init__(self, allfiles, dimension, num_ghost=None):
                             opener=opener,
                             opener_mode=opener_mode,
                         )
-                        for variable_name in column_description.keys():
+                        for variable_name in column_description:
                             var_list = self._vars_ascii_files.setdefault(
                                 variable_name, set()
                             )
@@ -1739,16 +1731,25 @@ def __init__(self, allfiles, dimension, num_ghost=None):
                 # bp4 file.
                 dir_path = os.path.split(f)[0]
                 with openpmd_series(dir_path) as series:
-                    iterations = series.iterations
-                    for _iteration, iteration_obj in iterations.items():
-                        all_meshes = iteration_obj.meshes
-                        for _mesh_name, mesh_obj in all_meshes.items():
-                            for mesh_var in mesh_obj.items():
-                                variable_name = mesh_var[0]
+                    for _iteration, iteration_obj in series.iterations.items():
+                        for (
+                            mesh_name,
+                            mesh_obj,
+                        ) in iteration_obj.meshes.items():
+                            matched = rx_mesh.match(mesh_name)
+                            if matched is None:
+                                raise RuntimeError(
+                                    f"Could not parse mesh {mesh_name}"
+                                )
+                            mesh_basename = matched.group(1)
+                            for variable_name in mesh_obj:
                                 var_list = self._vars_openpmd_files.setdefault(
                                     variable_name, set()
                                 )
                                 var_list.add(dir_path)
+                                self._openpmd_mesh_basenames[variable_name] = (
+                                    mesh_basename
+                                )
 
         # What pythonize_name_dict does is to make the various variables
         # accessible as attributes, e.g. self.fields.rho
@@ -1771,6 +1772,10 @@ def __getitem__(self, key):
                     self._vars_openpmd_files[var_name],
                     var_name,
                     self.dimension,
+                    # Get mesh_basenames corresponding to var name from dict.
+                    # mesh_basename is thornname_groupname, where groupname is the
+                    # name of the variable group that contains var_name
+                    self._openpmd_mesh_basenames[var_name],
                 )
             elif var_name in self._vars_ascii_files:
                 if self.num_ghost is None:
@@ -1989,7 +1994,7 @@ def total_filesize(self, unit="MB"):
         """
         # First we find all the unique files
         allfiles = set()
-        for dim in self._all_griddata.keys():
+        for dim in self._all_griddata:
             allfiles.update(self[dim].allfiles)
         # OpenPMD "files" are really directories, so if we want to measure their
         # size accurately we have to open them up