diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d8818f4e7b0..7cdf175cc7d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -115,6 +115,7 @@ jobs: - run: python -m pytest modin/tests/test_utils.py - run: python -m pytest asv_bench/test/test_utils.py - run: python -m pytest modin/tests/interchange/dataframe_protocol/base + - run: python -m pytest modin/tests/test_dataframe_api_standard.py - run: python -m pytest modin/tests/test_logging.py - uses: ./.github/actions/upload-coverage diff --git a/docs/getting_started/installation.rst b/docs/getting_started/installation.rst index 90ca8a0b7c0..c698e3fb53a 100644 --- a/docs/getting_started/installation.rst +++ b/docs/getting_started/installation.rst @@ -74,6 +74,15 @@ storage formats or for different functionalities of Modin. Here is a list of dep pip install "modin[mpi]" # If you want to use MPI through unidist execution engine + +Consortium Standard-compatible implementation based on Modin +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +.. 
code-block:: bash + + pip install "modin[consortium-standard]" + + Installing on Google Colab """"""""""""""""""""""""""" diff --git a/environment-dev.yml b/environment-dev.yml index 4d5dad714b0..0e0354ced56 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -61,6 +61,7 @@ dependencies: - isort>=5.12 - pip: + - git+https://github.com/data-apis/dataframe-api-compat.git@main - asv==0.5.1 # no conda package for windows so we install it with pip - connectorx>=0.2.6a4 diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index ab46b00d899..faa015fb6ed 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -67,6 +67,7 @@ _inherit_docstrings, expanduser_path_arg, hashable, + import_optional_dependency, try_cast_to_pandas, ) @@ -2892,6 +2893,23 @@ def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True): nan_as_null=nan_as_null, allow_copy=allow_copy ) + def __dataframe_consortium_standard__( + self, *, api_version: str | None = None + ): # noqa: PR01, RT01 + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of Modin. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. 
+ """ + dataframe_api_compat = import_optional_dependency( + "dataframe_api_compat", "implementation" + ) + convert_to_standard_compliant_dataframe = ( + dataframe_api_compat.modin_standard.convert_to_standard_compliant_dataframe + ) + return convert_to_standard_compliant_dataframe(self, api_version=api_version) + @property def attrs(self) -> dict: # noqa: RT01, D200 """ diff --git a/modin/pandas/series.py b/modin/pandas/series.py index c00fd61f614..9f63fad107b 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -34,7 +34,11 @@ from modin.config import PersistentPickle from modin.logging import disable_logging from modin.pandas.io import from_pandas, to_pandas -from modin.utils import MODIN_UNNAMED_SERIES_LABEL, _inherit_docstrings +from modin.utils import ( + MODIN_UNNAMED_SERIES_LABEL, + _inherit_docstrings, + import_optional_dependency, +) from .accessor import CachedAccessor, SparseAccessor from .base import _ATTRS_NO_LOOKUP, BasePandasDataset @@ -222,6 +226,22 @@ def __array__(self, dtype=None) -> np.ndarray: # noqa: PR01, RT01, D200 """ return super(Series, self).__array__(dtype).flatten() + def __column_consortium_standard__( + self, *, api_version: str | None = None + ): # noqa: PR01, RT01 + """ + Provide entry point to the Consortium DataFrame Standard API. + + This is developed and maintained outside of Modin. + Please report any issues to https://github.com/data-apis/dataframe-api-compat. + """ + dataframe_api_compat = import_optional_dependency( + "dataframe_api_compat", "implementation" + ) + return dataframe_api_compat.modin_standard.convert_to_standard_compliant_column( + self, api_version=api_version + ) + def __contains__(self, key: Hashable) -> bool: """ Check if `key` in the `Series.index`. 
diff --git a/modin/tests/test_dataframe_api_standard.py b/modin/tests/test_dataframe_api_standard.py new file mode 100644 index 00000000000..1a6b8b173ab --- /dev/null +++ b/modin/tests/test_dataframe_api_standard.py @@ -0,0 +1,37 @@ +# Licensed to Modin Development Team under one or more contributor license agreements. +# See the NOTICE file distributed with this work for additional information regarding +# copyright ownership. The Modin Development Team licenses this file to you under the +# Apache License, Version 2.0 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under +# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific language +# governing permissions and limitations under the License. + +import pytest + +import modin.pandas + + +def test_dataframe_api_standard() -> None: + """ + Test some basic methods of the dataframe consortium standard. + + Full testing is done at https://github.com/data-apis/dataframe-api-compat, + this is just to check that the entry point works as expected. 
+ """ + pytest.importorskip("dataframe_api_compat") + df_pd = modin.pandas.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df = df_pd.__dataframe_consortium_standard__() + result_1 = df.get_column_names() + expected_1 = ["a", "b"] + assert result_1 == expected_1 + + ser = modin.pandas.Series([1, 2, 3]) + col = ser.__column_consortium_standard__() + result_2 = col.get_value(1) + expected_2 = 2 + assert result_2 == expected_2 diff --git a/requirements-dev.txt b/requirements-dev.txt index 0fb5fed0a32..8563961b28a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -35,6 +35,7 @@ tqdm>=4.60.0 numexpr<2.8.5 # Latest modin-spreadsheet with widget fix git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 +git+https://github.com/data-apis/dataframe-api-compat.git@main ## dependencies for making release PyGithub>=1.58.0 diff --git a/requirements/env_hdk.yml b/requirements/env_hdk.yml index 83af0dddeeb..803b61af177 100644 --- a/requirements/env_hdk.yml +++ b/requirements/env_hdk.yml @@ -43,5 +43,6 @@ dependencies: - mypy>=1.0.0 - pip: + - git+https://github.com/data-apis/dataframe-api-compat.git@main # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.1.0 diff --git a/requirements/env_unidist_win.yml b/requirements/env_unidist_win.yml index 88ba9ae2b19..11b4c917e3b 100644 --- a/requirements/env_unidist_win.yml +++ b/requirements/env_unidist_win.yml @@ -54,6 +54,7 @@ dependencies: - pandas-stubs>=2.0.0 - pip: + - git+https://github.com/data-apis/dataframe-api-compat.git@main # Fixes breaking ipywidgets changes, but didn't release yet. 
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - connectorx>=0.2.6a4 diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index e48d19dc501..c6649a892e1 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -45,6 +45,7 @@ dependencies: - flake8-print>=5.0.0 - pip: + - git+https://github.com/data-apis/dataframe-api-compat.git@main - asv==0.5.1 # no conda package for windows - connectorx>=0.2.6a4 diff --git a/setup.py b/setup.py index e5862cae07a..3538be22775 100644 --- a/setup.py +++ b/setup.py @@ -9,12 +9,13 @@ # ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100 ray_deps = ["ray[default]>=2.1.0,!=2.5.0", "pyarrow>=7.0.0"] mpi_deps = ["unidist[mpi]>=0.2.1"] +consortium_standard_deps = ["dataframe-api-compat>=0.2.7"] spreadsheet_deps = ["modin-spreadsheet>=0.1.0"] # Currently, Modin does not include `mpi` option in `all`. # Otherwise, installation of modin[all] would fail because # users need to have a working MPI implementation and # certain software installed beforehand. -all_deps = dask_deps + ray_deps + spreadsheet_deps +all_deps = dask_deps + ray_deps + spreadsheet_deps + consortium_standard_deps # Distribute 'modin-autoimport-pandas.pth' along with binary and source distributions. # This file provides the "import pandas before Ray init" feature if specific @@ -62,6 +63,7 @@ def make_distribution(self): "dask": dask_deps, "ray": ray_deps, "mpi": mpi_deps, + "consortium-standard": consortium_standard_deps, "spreadsheet": spreadsheet_deps, "all": all_deps, },