diff --git a/strax/context.py b/strax/context.py
index 3fe7c475..2f58f262 100644
--- a/strax/context.py
+++ b/strax/context.py
@@ -1908,15 +1908,8 @@ def get_df(
 
         """
         df = self.get_array(run_id, targets, save=save, max_workers=max_workers, **kwargs)
-        try:
-            return pd.DataFrame.from_records(df)
-        except Exception as e:
-            if "Data must be 1-dimensional" in str(e):
-                raise ValueError(
-                    f"Cannot load '{targets}' as a dataframe because it has "
-                    "array fields. Please use get_array."
-                )
-            raise
+
+        return strax.convert_structured_array_to_df(df, log=self.log)
 
     def get_zarr(
         self,
diff --git a/strax/utils.py b/strax/utils.py
index 3f3ea41b..430c0e48 100644
--- a/strax/utils.py
+++ b/strax/utils.py
@@ -804,3 +804,40 @@ def convert_tuple_to_list(init_func_input):
     else:
         # if not a container, return. i.e. int, float, bytes, str etc.
         return func_input
+
+
+@export
+def convert_structured_array_to_df(structured_array, log=None):
+    """Convert a structured numpy array to a pandas DataFrame.
+
+    Parameters:
+        structured_array (numpy.ndarray): The structured array to be converted.
+    Returns:
+        pandas.DataFrame: The converted DataFrame.
+
+    """
+
+    if log is None:
+        import logging
+
+        log = logging.getLogger("strax_array_to_df")
+
+    data_dict = {}
+    converted_cols = []
+    for name in structured_array.dtype.names:
+        col = structured_array[name]
+        if col.ndim > 1:
+            # Convert n-dimensional columns to lists of ndarrays
+            data_dict[name] = [np.array(row) for row in col]
+            converted_cols.append(name)
+        else:
+            data_dict[name] = col
+
+    if converted_cols:
+        log.warning(
+            f"Columns {converted_cols} contain non-scalar entries. "
+            "Some pandas functions (e.g., groupby, apply) might "
+            "not perform as expected on these columns."
+        )
+
+    return pd.DataFrame(data_dict)
diff --git a/tests/test_core.py b/tests/test_core.py
index ca3063eb..b821b29e 100644
--- a/tests/test_core.py
+++ b/tests/test_core.py
@@ -33,6 +33,27 @@ def test_core(allow_multiprocess, max_workers, processor):
     assert bla.dtype == strax.peak_dtype()
 
 
+@processing_conditions
+def test_core_df(allow_multiprocess, max_workers, processor, caplog):
+    """Test that get_df works with N-dimensional data."""
+    mystrax = strax.Context(
+        storage=[],
+        register=[Records, Peaks],
+        processors=[processor],
+        allow_multiprocess=allow_multiprocess,
+        use_per_run_defaults=True,
+    )
+
+    df = mystrax.get_df(run_id=run_id, targets="peaks", max_workers=max_workers)
+    p = mystrax.get_single_plugin(run_id, "records")
+    assert len(df.loc[0, "data"]) == 200
+    assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"]
+    assert (
+        "contain non-scalar entries. Some pandas functions (e.g., groupby, apply)"
+        " might not perform as expected on these columns." in caplog.text
+    )
+
+
 def test_post_office_state():
     mystrax = strax.Context(
         storage=[],