From 82d7b2c9e19b0eb6c192c51fcffe4505d977e865 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 12 Oct 2023 18:26:45 +0200 Subject: [PATCH 1/2] PERF-#0000: avoid label synchronization for 'dot' operation Signed-off-by: Anatoly Myachev --- .../storage_formats/pandas/query_compiler.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index aef305d6c17..62a3e2ed241 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2231,14 +2231,6 @@ def dot(self, other, squeeze_self=None, squeeze_other=None): else other.to_pandas() ) - def map_func(df, other=other, squeeze_self=squeeze_self): # pragma: no cover - """Compute matrix multiplication of the passed frames.""" - result = df.squeeze(axis=1).dot(other) if squeeze_self else df.dot(other) - if is_list_like(result): - return pandas.DataFrame(result) - else: - return pandas.DataFrame([result]) - num_cols = other.shape[1] if len(other.shape) > 1 else 1 if len(self.columns) == 1: new_index = ( @@ -2255,8 +2247,33 @@ def map_func(df, other=other, squeeze_self=squeeze_self): # pragma: no cover new_columns = [MODIN_UNNAMED_SERIES_LABEL] if num_cols == 1 else None axis = 1 + align_index = isinstance(new_index, list) and new_index == [ + MODIN_UNNAMED_SERIES_LABEL + ] + align_columns = new_columns == [MODIN_UNNAMED_SERIES_LABEL] + + def map_func(df, other=other, squeeze_self=squeeze_self): # pragma: no cover + """Compute matrix multiplication of the passed frames.""" + result = df.squeeze(axis=1).dot(other) if squeeze_self else df.dot(other) + + if is_list_like(result): + res = pandas.DataFrame(result) + else: + res = pandas.DataFrame([result]) + + # manual aligning with external index to avoid `sync_labels` overhead + if align_columns: + res.columns = [MODIN_UNNAMED_SERIES_LABEL] + if align_index: + res.index = 
[MODIN_UNNAMED_SERIES_LABEL] + return res + new_modin_frame = self._modin_frame.apply_full_axis( - axis, map_func, new_index=new_index, new_columns=new_columns + axis, + map_func, + new_index=new_index, + new_columns=new_columns, + sync_labels=False, ) return self.__constructor__(new_modin_frame) From 01fb7221c0d4ea432e2cb9ec4fc9bd027e0f503c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 16 Oct 2023 14:26:52 +0200 Subject: [PATCH 2/2] Update modin/core/storage_formats/pandas/query_compiler.py Co-authored-by: Dmitry Chigarev --- modin/core/storage_formats/pandas/query_compiler.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index 62a3e2ed241..0fc992bb05b 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -2247,6 +2247,11 @@ def dot(self, other, squeeze_self=None, squeeze_other=None): new_columns = [MODIN_UNNAMED_SERIES_LABEL] if num_cols == 1 else None axis = 1 + # If either new index or new columns are supposed to be single-dimensional, + # then we use a special labeling for them. Besides setting the new labels as + # metadata on the resulting frame, we also want to set them inside the kernel, + # so actual partitions would be labeled accordingly (there's a 'sync_labels' + # parameter that can do the same, but doing it manually is faster) align_index = isinstance(new_index, list) and new_index == [ MODIN_UNNAMED_SERIES_LABEL ]