From 3e11535c32057c004f0f02ec82adaffa6d9a8bb7 Mon Sep 17 00:00:00 2001 From: Lorenzo Principe <28869147+lorenzomag@users.noreply.github.com> Date: Sun, 1 Sep 2024 23:01:06 -0500 Subject: [PATCH 1/7] Allow get_df on all data_types --- strax/context.py | 12 +++--------- strax/utils.py | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/strax/context.py b/strax/context.py index dc1c4864..fbd18e42 100644 --- a/strax/context.py +++ b/strax/context.py @@ -1910,15 +1910,9 @@ def get_df( """ df = self.get_array(run_id, targets, save=save, max_workers=max_workers, **kwargs) - try: - return pd.DataFrame.from_records(df) - except Exception as e: - if "Data must be 1-dimensional" in str(e): - raise ValueError( - f"Cannot load '{targets}' as a dataframe because it has " - "array fields. Please use get_array." - ) - raise + + return strax.convert_structured_array_to_df(df) + def get_zarr( self, diff --git a/strax/utils.py b/strax/utils.py index 95decf85..e4f11088 100644 --- a/strax/utils.py +++ b/strax/utils.py @@ -813,3 +813,23 @@ def convert_tuple_to_list(init_func_input): else: # if not a container, return. i.e. int, float, bytes, str etc. return func_input + +@export +def convert_structured_array_to_df(structured_array): + """ + Convert a structured numpy array to a pandas DataFrame. + Parameters: + structured_array (numpy.ndarray): The structured array to be converted. + Returns: + pandas.DataFrame: The converted DataFrame. + """ + + data_dict = {} + for name in structured_array.dtype.names: + col = structured_array[name] + if col.ndim > 1: + # Convert n-dimensional columns to lists of ndarrays + data_dict[name] = [np.array(row) for row in col] + else: + data_dict[name] = col + return pd.DataFrame(data_dict) \ No newline at end of file From 908fb5d8631bc9091793405e16f65a642210f8ff Mon Sep 17 00:00:00 2001 From: Lorenzo Principe <28869147+lorenzomag@users.noreply.github.com> Date: Mon, 2 Sep 2024 02:05:30 -0500 Subject: [PATCH 2/7] Add basic test for get_df on peaks data --- tests/test_core.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_core.py b/tests/test_core.py index 605275ab..acc9d282 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -32,6 +32,23 @@ def test_core(allow_multiprocess, max_workers, processor): assert bla.dtype == strax.peak_dtype() +@processing_conditions +def test_core_df(allow_multiprocess, max_workers, processor): + """Test that get_df works with N-dimensional data""" + mystrax = strax.Context( + storage=[], + register=[Records, Peaks], + processors=[processor], + allow_multiprocess=allow_multiprocess, + use_per_run_defaults=True, + ) + + df = mystrax.get_df(run_id=run_id, targets="peaks", max_workers=max_workers) + p = mystrax.get_single_plugin(run_id, "records") + assert len(df.loc[0, "data"]) == 200 + assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"] + + def test_post_office_state(): mystrax = strax.Context( storage=[], From de4c778638dfbcb8e25d7f52160d35b5787b0a0b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 2 Sep 2024 07:16:19 +0000 Subject: [PATCH 3/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- strax/context.py | 1 - strax/utils.py | 10 ++++++---- tests/test_core.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/strax/context.py b/strax/context.py index fbd18e42..b022053a 100644 --- a/strax/context.py +++ b/strax/context.py @@ -1913,7 +1913,6 @@ def get_df( return strax.convert_structured_array_to_df(df) - def get_zarr( self, run_ids, diff --git a/strax/utils.py b/strax/utils.py index e4f11088..bd9a491a 100644 --- a/strax/utils.py +++ b/strax/utils.py @@ -814,16 +814,18 @@ def convert_tuple_to_list(init_func_input): # if not a container, return. i.e. int, float, bytes, str etc. return func_input + @export def convert_structured_array_to_df(structured_array): - """ - Convert a structured numpy array to a pandas DataFrame. + """Convert a structured numpy array to a pandas DataFrame. + Parameters: structured_array (numpy.ndarray): The structured array to be converted. Returns: pandas.DataFrame: The converted DataFrame. + """ - + data_dict = {} for name in structured_array.dtype.names: col = structured_array[name] @@ -832,4 +834,4 @@ def convert_structured_array_to_df(structured_array): data_dict[name] = [np.array(row) for row in col] else: data_dict[name] = col - return pd.DataFrame(data_dict) \ No newline at end of file + return pd.DataFrame(data_dict) diff --git a/tests/test_core.py b/tests/test_core.py index acc9d282..62d14c9b 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -34,7 +34,7 @@ def test_core(allow_multiprocess, max_workers, processor): @processing_conditions def test_core_df(allow_multiprocess, max_workers, processor): - """Test that get_df works with N-dimensional data""" + """Test that get_df works with N-dimensional data.""" mystrax = strax.Context( storage=[], register=[Records, Peaks], From 38c48deada47c882d7b2084552a800fdb37d1391 Mon Sep 17 00:00:00 2001 From: Lorenzo Principe <28869147+lorenzomag@users.noreply.github.com> Date: Thu, 19 Sep 2024 23:30:56 -0500 Subject: [PATCH 4/7] Add warning message for non-scalar DataFrame entries --- strax/context.py | 2 +- strax/utils.py | 20 +++++++++++++++++--- tests/test_core.py | 5 +++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/strax/context.py b/strax/context.py index e83c0d9e..193353e8 100644 --- a/strax/context.py +++ b/strax/context.py @@ -1909,7 +1909,7 @@ def get_df( """ df = self.get_array(run_id, targets, save=save, max_workers=max_workers, **kwargs) - return strax.convert_structured_array_to_df(df) + return strax.convert_structured_array_to_df(df, log=self.log) def get_zarr( self, diff --git a/strax/utils.py b/strax/utils.py index 01af8667..df13d9bf 100644 --- a/strax/utils.py +++ b/strax/utils.py @@ -807,9 +807,9 @@ def convert_tuple_to_list(init_func_input): @export -def convert_structured_array_to_df(structured_array): - """Convert a structured numpy array to a pandas DataFrame. - +def convert_structured_array_to_df(structured_array, log=None): + """ + Convert a structured numpy array to a pandas DataFrame. Parameters: structured_array (numpy.ndarray): The structured array to be converted. Returns: @@ -817,12 +817,26 @@ def convert_structured_array_to_df(structured_array): """ + if log is None: + import logging + + log = logging.getLogger("strax_array_to_df") + data_dict = {} + converted_cols = [] for name in structured_array.dtype.names: col = structured_array[name] if col.ndim > 1: # Convert n-dimensional columns to lists of ndarrays data_dict[name] = [np.array(row) for row in col] + converted_cols.append(name) else: data_dict[name] = col + + if converted_cols: + log.warning( + f"Columns {converted_cols} contain non-scalar entries. " + "Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." + ) + return pd.DataFrame(data_dict) diff --git a/tests/test_core.py b/tests/test_core.py index 62d14c9b..c09cb907 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -33,8 +33,8 @@ def test_core(allow_multiprocess, max_workers, processor): @processing_conditions -def test_core_df(allow_multiprocess, max_workers, processor): - """Test that get_df works with N-dimensional data.""" +def test_core_df(allow_multiprocess, max_workers, processor, caplog): + """Test that get_df works with N-dimensional data""" mystrax = strax.Context( storage=[], register=[Records, Peaks], @@ -47,6 +47,7 @@ def test_core_df(allow_multiprocess, max_workers, processor): p = mystrax.get_single_plugin(run_id, "records") assert len(df.loc[0, "data"]) == 200 assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"] + assert "contain non-scalar entries. Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." in caplog.text def test_post_office_state(): From 9d06ad11ec3efda9aa05d72abe1492533286e8fd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 04:40:23 +0000 Subject: [PATCH 5/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- strax/utils.py | 6 +++--- tests/test_core.py | 7 +++++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/strax/utils.py b/strax/utils.py index df13d9bf..49971807 100644 --- a/strax/utils.py +++ b/strax/utils.py @@ -808,8 +808,8 @@ def convert_tuple_to_list(init_func_input): @export def convert_structured_array_to_df(structured_array, log=None): - """ - Convert a structured numpy array to a pandas DataFrame. + """Convert a structured numpy array to a pandas DataFrame. + Parameters: structured_array (numpy.ndarray): The structured array to be converted. Returns: @@ -821,7 +821,7 @@ def convert_structured_array_to_df(structured_array, log=None): import logging log = logging.getLogger("strax_array_to_df") - + data_dict = {} converted_cols = [] for name in structured_array.dtype.names: diff --git a/tests/test_core.py b/tests/test_core.py index c09cb907..3ce990b3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -34,7 +34,7 @@ def test_core(allow_multiprocess, max_workers, processor): @processing_conditions def test_core_df(allow_multiprocess, max_workers, processor, caplog): - """Test that get_df works with N-dimensional data""" + """Test that get_df works with N-dimensional data.""" mystrax = strax.Context( storage=[], register=[Records, Peaks], @@ -47,7 +47,10 @@ def test_core_df(allow_multiprocess, max_workers, processor, caplog): p = mystrax.get_single_plugin(run_id, "records") assert len(df.loc[0, "data"]) == 200 assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"] - assert "contain non-scalar entries. Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." in caplog.text + assert ( + "contain non-scalar entries. Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." + in caplog.text + ) def test_post_office_state(): From b7c981a9ca4e0feaf6ac7a3d1752cde3706a8548 Mon Sep 17 00:00:00 2001 From: Lorenzo Principe <28869147+lorenzomag@users.noreply.github.com> Date: Fri, 20 Sep 2024 00:19:47 -0500 Subject: [PATCH 6/7] Reduce line length --- strax/utils.py | 3 ++- tests/test_core.py | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/strax/utils.py b/strax/utils.py index 49971807..430c0e48 100644 --- a/strax/utils.py +++ b/strax/utils.py @@ -836,7 +836,8 @@ def convert_structured_array_to_df(structured_array, log=None): if converted_cols: log.warning( f"Columns {converted_cols} contain non-scalar entries. " - "Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." + "Some pandas functions (e.g., groupby, apply) might " + "not perform as expected on these columns." ) return pd.DataFrame(data_dict) diff --git a/tests/test_core.py b/tests/test_core.py index 3ce990b3..9590a8b9 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -34,6 +34,7 @@ def test_core(allow_multiprocess, max_workers, processor): @processing_conditions def test_core_df(allow_multiprocess, max_workers, processor, caplog): + """Test that get_df works with N-dimensional data.""" """Test that get_df works with N-dimensional data.""" mystrax = strax.Context( storage=[], @@ -48,7 +49,8 @@ def test_core_df(allow_multiprocess, max_workers, processor, caplog): assert len(df.loc[0, "data"]) == 200 assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"] assert ( - "contain non-scalar entries. Some pandas functions (e.g., groupby, apply) might not perform as expected on these columns." + "contain non-scalar entries. Some pandas functions (e.g., groupby, apply)" + " might not perform as expected on these columns." in caplog.text ) From 471b2357c6441601d0e43b93fe26992e038f2d2e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 20 Sep 2024 05:22:22 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_core.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_core.py b/tests/test_core.py index 9590a8b9..fbee3bc4 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -50,8 +50,7 @@ def test_core_df(allow_multiprocess, max_workers, processor, caplog): assert len(df) == p.config["recs_per_chunk"] * p.config["n_chunks"] assert ( "contain non-scalar entries. Some pandas functions (e.g., groupby, apply)" - " might not perform as expected on these columns." - in caplog.text + " might not perform as expected on these columns." in caplog.text )