Skip to content

Commit

Permalink
fix: RNTuple bug-fixing offset array concatenation, adding filter_name (
Browse files Browse the repository at this point in the history
…#1285)

* Fixing cardinality cluster edges

* style: pre-commit fixes

* Ruff CI error fix

* Pull request edit

* style: pre-commit fixes

* Testing newest ROOT version

* style: pre-commit fixes

* Update src/uproot/models/RNTuple.py

Co-authored-by: Andres Rios Tascon <[email protected]>

* Update src/uproot/models/RNTuple.py

Co-authored-by: Andres Rios Tascon <[email protected]>

* style: pre-commit fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Andres Rios Tascon <[email protected]>
Co-authored-by: Jim Pivarski <[email protected]>
  • Loading branch information
4 people committed Sep 12, 2024
1 parent 1d5a17a commit dc19ce9
Showing 1 changed file with 42 additions and 9 deletions.
51 changes: 42 additions & 9 deletions src/uproot/models/RNTuple.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import struct
from collections import defaultdict
from itertools import accumulate

import numpy

Expand Down Expand Up @@ -60,8 +61,20 @@ def _keys(self):
keys.append(fr.field_name)
return keys

def keys(self):
return self._keys
def keys(
    self,
    *,
    filter_name=None,
    filter_typename=None,
    recursive=False,
    full_paths=True,
    # TODO: some arguments might be missing when compared with TTree. Solve when blocker is present in dask/coffea.
):
    """
    Return the top-level field names of this RNTuple.

    Args:
        filter_name (None, str, or iterable of str): If given, only keys
            whose name appears in ``filter_name`` are returned. A single
            string is treated as one field name. ``None`` (default)
            returns all keys.
        filter_typename: Currently unused; accepted for signature
            compatibility with ``TTree.keys``.
        recursive (bool): Currently unused; accepted for signature
            compatibility with ``TTree.keys``.
        full_paths (bool): Currently unused; accepted for signature
            compatibility with ``TTree.keys``.
    """
    if filter_name:
        # A bare string would otherwise be used as the right-hand side of
        # ``in`` and silently do *substring* matching (e.g. key "one" would
        # match filter_name "one-two"); wrap it so it means one field name.
        if isinstance(filter_name, str):
            filter_name = [filter_name]
        # Return only the keys that appear in the filter_name collection:
        return [key for key in self._keys if key in filter_name]
    else:
        return self._keys

def read_members(self, chunk, cursor, context, file):
if uproot._awkwardforth.get_forth_obj(context) is not None:
Expand Down Expand Up @@ -480,10 +493,28 @@ def read_pagedesc(self, destination, desc, dtype_str, dtype, nbits, split):
# needed to chop off extra bits incase we used `unpackbits`
destination[:] = content[:num_elements]

def read_col_pages(self, ncol, cluster_range, pad_missing_ele=False):
res = numpy.concatenate(
[self.read_col_page(ncol, i) for i in cluster_range], axis=0
)
def read_col_pages(self, ncol, cluster_range, dtype_byte, pad_missing_ele=False):
arrays = [self.read_col_page(ncol, i) for i in cluster_range]

# Check if column stores offset values for jagged arrays (splitindex64) (applies to cardinality cols too):
if (
dtype_byte == uproot.const.rntuple_col_type_to_num_dict["splitindex64"]
or dtype_byte == uproot.const.rntuple_col_type_to_num_dict["splitindex32"]
):
# Extract the last offset values:
last_elements = [
arr[-1] for arr in arrays[:-1]
] # First value always zero, therefore skip first arr.
# Compute cumulative sum using itertools.accumulate:
last_offsets = list(accumulate(last_elements))
# Add the offsets to each array
for i in range(1, len(arrays)):
arrays[i] += last_offsets[i - 1]
# Remove the first element from every sub-array except for the first one:
arrays = [arrays[0]] + [arr[1:] for arr in arrays[1:]]

res = numpy.concatenate(arrays, axis=0)

if pad_missing_ele:
first_ele_index = self.column_records[ncol].first_ele_index
res = numpy.pad(res, (first_ele_index, 0))
Expand Down Expand Up @@ -530,8 +561,8 @@ def read_col_page(self, ncol, cluster_i):

def arrays(
self,
filter_names="*",
filter_typenames=None,
filter_name="*",
filter_typename=None,
entry_start=0,
entry_stop=None,
decompression_executor=None,
Expand All @@ -553,7 +584,7 @@ def arrays(
)

form = self.to_akform().select_columns(
filter_names, prune_unions_and_records=False
filter_name, prune_unions_and_records=False
)
# only read columns mentioned in the awkward form
target_cols = []
Expand All @@ -563,9 +594,11 @@ def arrays(
if "column" in key and "union" not in key:
key_nr = int(key.split("-")[1])
dtype_byte = self.column_records[key_nr].type

content = self.read_col_pages(
key_nr,
range(start_cluster_idx, stop_cluster_idx),
dtype_byte=dtype_byte,
pad_missing_ele=True,
)
if "cardinality" in key:
Expand Down

0 comments on commit dc19ce9

Please sign in to comment.