From 8a9308ad77a69d90f50ae0f33fef0fbcb65fffc7 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 15 Apr 2024 14:14:47 +0200 Subject: [PATCH] FEAT-#7180: Add type hints for modin.pandas.[functions] (#7181) Signed-off-by: Anatoly Myachev --- modin/pandas/general.py | 38 ++++++++++++++++----------- modin/pandas/io.py | 58 ++++++++++++++++++++--------------------- 2 files changed, 52 insertions(+), 44 deletions(-) diff --git a/modin/pandas/general.py b/modin/pandas/general.py index 6ca0b8681cf..aeff9986f35 100644 --- a/modin/pandas/general.py +++ b/modin/pandas/general.py @@ -13,13 +13,15 @@ """Implement pandas general API.""" +from __future__ import annotations + import warnings from typing import Hashable, Iterable, Mapping, Optional, Union import numpy as np import pandas from pandas._libs.lib import NoDefault, no_default -from pandas._typing import DtypeBackend +from pandas._typing import ArrayLike, DtypeBackend, Scalar, npt from pandas.core.dtypes.common import is_list_like from modin.core.storage_formats import BaseQueryCompiler @@ -35,7 +37,9 @@ @_inherit_docstrings(pandas.isna, apilink="pandas.isna") @enable_logging -def isna(obj): # noqa: PR01, RT01, D200 +def isna( + obj, +) -> bool | npt.NDArray[np.bool_] | Series | DataFrame: # noqa: PR01, RT01, D200 """ Detect missing values for an array-like object. """ @@ -50,7 +54,9 @@ def isna(obj): # noqa: PR01, RT01, D200 @_inherit_docstrings(pandas.notna, apilink="pandas.notna") @enable_logging -def notna(obj): # noqa: PR01, RT01, D200 +def notna( + obj, +) -> bool | npt.NDArray[np.bool_] | Series | DataFrame: # noqa: PR01, RT01, D200 """ Detect non-missing values for an array-like object. """ @@ -79,7 +85,7 @@ def merge( copy: Optional[bool] = None, indicator: bool = False, validate=None, -): # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Merge DataFrame or named Series objects with a database-style join. """ @@ -232,7 +238,7 @@ def pivot_table( margins_name="All", observed=no_default, sort=True, -): +) -> DataFrame: if not isinstance(data, DataFrame): raise ValueError( "can not create pivot table with instance of type {}".format(type(data)) @@ -256,7 +262,7 @@ def pivot_table( @enable_logging def pivot( data, *, columns, index=no_default, values=no_default -): # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Return reshaped DataFrame organized by given index / column values. """ @@ -272,7 +278,7 @@ def to_numeric( errors="raise", downcast=None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -): # noqa: PR01, RT01, D200 +) -> Scalar | np.ndarray | Series: # noqa: PR01, RT01, D200 """ Convert argument to a numeric type. """ @@ -360,7 +366,7 @@ def _wrap_in_series_object(qc_result): @_inherit_docstrings(pandas.unique, apilink="pandas.unique") @enable_logging -def unique(values): # noqa: PR01, RT01, D200 +def unique(values) -> ArrayLike: # noqa: PR01, RT01, D200 """ Return unique values based on a hash table. """ @@ -371,7 +377,7 @@ def unique(values): # noqa: PR01, RT01, D200 @enable_logging def value_counts( values, sort=True, ascending=False, normalize=False, bins=None, dropna=True -): +) -> Series: """ Compute a histogram of the counts of non-null values. @@ -423,7 +429,7 @@ def concat( verify_integrity: bool = False, sort: bool = False, copy: Optional[bool] = None, -) -> "DataFrame | Series": # noqa: PR01, RT01, D200 +) -> DataFrame | Series: # noqa: PR01, RT01, D200 """ Concatenate Modin objects along a particular axis. """ @@ -572,7 +578,7 @@ def to_datetime( infer_datetime_format=no_default, origin="unix", cache=True, -): # noqa: PR01, RT01, D200 +) -> Scalar | ArrayLike | Series | DataFrame: # noqa: PR01, RT01, D200 """ Convert argument to datetime. """ @@ -615,7 +621,7 @@ def get_dummies( sparse=False, drop_first=False, dtype=None, -): # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Convert categorical variable into dummy/indicator variables. """ @@ -663,7 +669,7 @@ def melt( value_name="value", col_level=None, ignore_index: bool = True, -): # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. """ @@ -712,7 +718,7 @@ def crosstab( # Adding docstring since pandas docs don't have web section for this function. @enable_logging -def lreshape(data: DataFrame, groups, dropna=True): +def lreshape(data: DataFrame, groups, dropna=True) -> DataFrame: """ Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``. @@ -796,7 +802,9 @@ def get_names(obj): @_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_timedelta") @enable_logging -def to_timedelta(arg, unit=None, errors="raise"): # noqa: PR01, RT01, D200 +def to_timedelta( + arg, unit=None, errors="raise" +) -> Scalar | pandas.Index | Series: # noqa: PR01, RT01, D200 """ Convert argument to timedelta. diff --git a/modin/pandas/io.py b/modin/pandas/io.py index 02fd8ba1de2..e29629ebd46 100644 --- a/modin/pandas/io.py +++ b/modin/pandas/io.py @@ -149,7 +149,7 @@ def read_xml( compression: CompressionOptions = "infer", storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -) -> "DataFrame": +) -> DataFrame: ErrorMessage.default_to_pandas("read_xml") _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) return ModinObjects.DataFrame(pandas.read_xml(**kwargs)) @@ -217,7 +217,7 @@ def read_csv( float_precision: Literal["high", "legacy"] | None = None, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -) -> "DataFrame" | TextFileReader: +) -> DataFrame | TextFileReader: # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_csv_signature = { val.name for val in inspect.signature(pandas.read_csv).parameters.values() @@ -289,7 +289,7 @@ def read_table( float_precision: str | None = None, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -) -> "DataFrame" | TextFileReader: +) -> DataFrame | TextFileReader: # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args _pd_read_table_signature = { val.name for val in inspect.signature(pandas.read_table).parameters.values() @@ -314,7 +314,7 @@ def read_parquet( filesystem=None, filters=None, **kwargs, -) -> "DataFrame": +) -> DataFrame: from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher if engine == "fastparquet" and dtype_backend is not no_default: @@ -360,7 +360,7 @@ def read_json( storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, engine="ujson", -) -> "DataFrame" | "Series" | pandas.io.json._json.JsonReader: +) -> DataFrame | Series | pandas.io.json._json.JsonReader: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -384,7 +384,7 @@ def read_gbq( use_bqstorage_api: bool | None = None, max_results: int | None = None, progress_bar_type: str | None = None, -) -> "DataFrame": +) -> DataFrame: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) kwargs.update(kwargs.pop("kwargs", {})) @@ -416,7 +416,7 @@ def read_html( extract_links: Literal[None, "header", "footer", "body", "all"] = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, storage_options: StorageOptions = None, -) -> list["DataFrame"]: # noqa: PR01, RT01, D200 +) -> list[DataFrame]: # noqa: PR01, RT01, D200 """ Read HTML tables into a ``DataFrame`` object. """ @@ -434,7 +434,7 @@ def read_clipboard( sep=r"\s+", dtype_backend: Union[DtypeBackend, NoDefault] = no_default, **kwargs, -): # pragma: no cover # noqa: PR01, RT01, D200 +) -> DataFrame: # pragma: no cover # noqa: PR01, RT01, D200 """ Read text from clipboard and pass to read_csv. """ @@ -482,7 +482,7 @@ def read_excel( storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, engine_kwargs: Optional[dict] = None, -) -> "DataFrame" | dict[IntStrT, "DataFrame"]: +) -> DataFrame | dict[IntStrT, DataFrame]: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -533,7 +533,7 @@ def read_feather( use_threads: bool = True, storage_options: StorageOptions = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -): +) -> DataFrame: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -560,7 +560,7 @@ def read_stata( iterator: bool = False, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, -) -> "DataFrame" | pandas.io.stata.StataReader: +) -> DataFrame | pandas.io.stata.StataReader: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -580,7 +580,7 @@ def read_sas( chunksize: int | None = None, iterator: bool = False, compression: CompressionOptions = "infer", -) -> "DataFrame" | pandas.io.sas.sasreader.ReaderBase: # noqa: PR01, RT01, D200 +) -> DataFrame | pandas.io.sas.sasreader.ReaderBase: # noqa: PR01, RT01, D200 """ Read SAS files stored as either XPORT or SAS7BDAT format files. """ @@ -606,7 +606,7 @@ def read_pickle( filepath_or_buffer, compression: CompressionOptions = "infer", storage_options: StorageOptions = None, -): +) -> DataFrame | Series: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -629,7 +629,7 @@ def read_sql( chunksize=None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, dtype=None, -): # noqa: PR01, RT01, D200 +) -> DataFrame | Iterator[DataFrame]: # noqa: PR01, RT01, D200 """ Read SQL query or database table into a DataFrame. """ @@ -660,7 +660,7 @@ def read_fwf( iterator: bool = False, chunksize: Optional[int] = None, **kwds, -): # noqa: PR01, RT01, D200 +) -> DataFrame | TextFileReader: # noqa: PR01, RT01, D200 """ Read a table of fixed-width formatted lines into DataFrame. """ @@ -695,7 +695,7 @@ def read_sql_table( columns=None, chunksize=None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -): # noqa: PR01, RT01, D200 +) -> DataFrame | Iterator[DataFrame]: # noqa: PR01, RT01, D200 """ Read SQL database table into a DataFrame. """ @@ -720,7 +720,7 @@ def read_sql_query( chunksize: int | None = None, dtype: DtypeArg | None = None, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -) -> "DataFrame" | Iterator["DataFrame"]: +) -> DataFrame | Iterator[DataFrame]: _, _, _, kwargs = inspect.getargvalues(inspect.currentframe()) from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher @@ -761,7 +761,7 @@ def read_spss( usecols: Optional[Sequence[str]] = None, convert_categoricals: bool = True, dtype_backend: Union[DtypeBackend, NoDefault] = no_default, -): # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Load an SPSS file from the file path, returning a DataFrame. """ @@ -788,7 +788,7 @@ def json_normalize( errors: Optional[str] = "raise", sep: str = ".", max_level: Optional[int] = None, -) -> "DataFrame": # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Normalize semi-structured JSON data into a flat table. """ @@ -809,7 +809,7 @@ def read_orc( dtype_backend: Union[DtypeBackend, NoDefault] = no_default, filesystem=None, **kwargs, -) -> "DataFrame": # noqa: PR01, RT01, D200 +) -> DataFrame: # noqa: PR01, RT01, D200 """ Load an ORC object from the file path, returning a DataFrame. """ @@ -948,7 +948,7 @@ def return_handler(*args, **kwargs): return method -def from_non_pandas(df, index, columns, dtype): +def from_non_pandas(df, index, columns, dtype) -> DataFrame | None: """ Convert a non-pandas DataFrame into Modin DataFrame. @@ -976,7 +976,7 @@ def from_non_pandas(df, index, columns, dtype): return new_qc -def from_pandas(df): +def from_pandas(df) -> DataFrame: """ Convert a pandas DataFrame to a Modin DataFrame. @@ -995,7 +995,7 @@ def from_pandas(df): return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df)) -def from_arrow(at): +def from_arrow(at) -> DataFrame: """ Convert an Arrow Table to a Modin DataFrame. @@ -1014,7 +1014,7 @@ def from_arrow(at): return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_arrow(at)) -def from_dataframe(df): +def from_dataframe(df) -> DataFrame: """ Convert a DataFrame implementing the dataframe exchange protocol to a Modin DataFrame. @@ -1035,7 +1035,7 @@ def from_dataframe(df): return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dataframe(df)) -def from_ray_dataset(ray_obj): +def from_ray_dataset(ray_obj) -> DataFrame: """ Convert a Ray Dataset into Modin DataFrame. @@ -1063,7 +1063,7 @@ def from_ray_dataset(ray_obj): from_ray(ray_obj) -def from_ray(ray_obj): +def from_ray(ray_obj) -> DataFrame: """ Convert a Ray Dataset into Modin DataFrame. @@ -1086,7 +1086,7 @@ def from_ray(ray_obj): return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_ray(ray_obj)) -def from_dask(dask_obj): +def from_dask(dask_obj) -> DataFrame: """ Convert a Dask DataFrame to a Modin DataFrame. @@ -1109,7 +1109,7 @@ def from_dask(dask_obj): return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dask(dask_obj)) -def to_pandas(modin_obj: SupportsPublicToPandas) -> Any: +def to_pandas(modin_obj: SupportsPublicToPandas) -> DataFrame | Series: """ Convert a Modin DataFrame/Series to a pandas DataFrame/Series. @@ -1134,7 +1134,7 @@ def to_numpy( Parameters ---------- - modin_obj : modin.DataFrame, modin."Series", modin.numpy.array + modin_obj : modin.DataFrame, modin.Series, modin.numpy.array The Modin distributed object to convert. Returns