From 8a9308ad77a69d90f50ae0f33fef0fbcb65fffc7 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev <anatoliimyachev@mail.com>
Date: Mon, 15 Apr 2024 14:14:47 +0200
Subject: [PATCH] FEAT-#7180: Add type hints for modin.pandas.[functions]
 (#7181)

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
---
 modin/pandas/general.py | 38 ++++++++++++++++-----------
 modin/pandas/io.py      | 58 ++++++++++++++++++++---------------------
 2 files changed, 52 insertions(+), 44 deletions(-)

diff --git a/modin/pandas/general.py b/modin/pandas/general.py
index 6ca0b8681cf..aeff9986f35 100644
--- a/modin/pandas/general.py
+++ b/modin/pandas/general.py
@@ -13,13 +13,15 @@
 
 """Implement pandas general API."""
 
+from __future__ import annotations
+
 import warnings
 from typing import Hashable, Iterable, Mapping, Optional, Union
 
 import numpy as np
 import pandas
 from pandas._libs.lib import NoDefault, no_default
-from pandas._typing import DtypeBackend
+from pandas._typing import ArrayLike, DtypeBackend, Scalar, npt
 from pandas.core.dtypes.common import is_list_like
 
 from modin.core.storage_formats import BaseQueryCompiler
@@ -35,7 +37,9 @@
 
 @_inherit_docstrings(pandas.isna, apilink="pandas.isna")
 @enable_logging
-def isna(obj):  # noqa: PR01, RT01, D200
+def isna(
+    obj,
+) -> bool | npt.NDArray[np.bool_] | Series | DataFrame:  # noqa: PR01, RT01, D200
     """
     Detect missing values for an array-like object.
     """
@@ -50,7 +54,9 @@ def isna(obj):  # noqa: PR01, RT01, D200
 
 @_inherit_docstrings(pandas.notna, apilink="pandas.notna")
 @enable_logging
-def notna(obj):  # noqa: PR01, RT01, D200
+def notna(
+    obj,
+) -> bool | npt.NDArray[np.bool_] | Series | DataFrame:  # noqa: PR01, RT01, D200
     """
     Detect non-missing values for an array-like object.
     """
@@ -79,7 +85,7 @@ def merge(
     copy: Optional[bool] = None,
     indicator: bool = False,
     validate=None,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Merge DataFrame or named Series objects with a database-style join.
     """
@@ -232,7 +238,7 @@ def pivot_table(
     margins_name="All",
     observed=no_default,
     sort=True,
-):
+) -> DataFrame:
     if not isinstance(data, DataFrame):
         raise ValueError(
             "can not create pivot table with instance of type {}".format(type(data))
@@ -256,7 +262,7 @@ def pivot_table(
 @enable_logging
 def pivot(
     data, *, columns, index=no_default, values=no_default
-):  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Return reshaped DataFrame organized by given index / column values.
     """
@@ -272,7 +278,7 @@ def to_numeric(
     errors="raise",
     downcast=None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-):  # noqa: PR01, RT01, D200
+) -> Scalar | np.ndarray | Series:  # noqa: PR01, RT01, D200
     """
     Convert argument to a numeric type.
     """
@@ -360,7 +366,7 @@ def _wrap_in_series_object(qc_result):
 
 @_inherit_docstrings(pandas.unique, apilink="pandas.unique")
 @enable_logging
-def unique(values):  # noqa: PR01, RT01, D200
+def unique(values) -> ArrayLike:  # noqa: PR01, RT01, D200
     """
     Return unique values based on a hash table.
     """
@@ -371,7 +377,7 @@ def unique(values):  # noqa: PR01, RT01, D200
 @enable_logging
 def value_counts(
     values, sort=True, ascending=False, normalize=False, bins=None, dropna=True
-):
+) -> Series:
     """
     Compute a histogram of the counts of non-null values.
 
@@ -423,7 +429,7 @@ def concat(
     verify_integrity: bool = False,
     sort: bool = False,
     copy: Optional[bool] = None,
-) -> "DataFrame | Series":  # noqa: PR01, RT01, D200
+) -> DataFrame | Series:  # noqa: PR01, RT01, D200
     """
     Concatenate Modin objects along a particular axis.
     """
@@ -572,7 +578,7 @@ def to_datetime(
     infer_datetime_format=no_default,
     origin="unix",
     cache=True,
-):  # noqa: PR01, RT01, D200
+) -> Scalar | ArrayLike | Series | DataFrame:  # noqa: PR01, RT01, D200
     """
     Convert argument to datetime.
     """
@@ -615,7 +621,7 @@ def get_dummies(
     sparse=False,
     drop_first=False,
     dtype=None,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Convert categorical variable into dummy/indicator variables.
     """
@@ -663,7 +669,7 @@ def melt(
     value_name="value",
     col_level=None,
     ignore_index: bool = True,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
     """
@@ -712,7 +718,7 @@ def crosstab(
 
 # Adding docstring since pandas docs don't have web section for this function.
 @enable_logging
-def lreshape(data: DataFrame, groups, dropna=True):
+def lreshape(data: DataFrame, groups, dropna=True) -> DataFrame:
     """
     Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``.
 
@@ -796,7 +802,9 @@ def get_names(obj):
 
 @_inherit_docstrings(pandas.to_datetime, apilink="pandas.to_timedelta")
 @enable_logging
-def to_timedelta(arg, unit=None, errors="raise"):  # noqa: PR01, RT01, D200
+def to_timedelta(
+    arg, unit=None, errors="raise"
+) -> Scalar | pandas.Index | Series:  # noqa: PR01, RT01, D200
     """
     Convert argument to timedelta.
 
diff --git a/modin/pandas/io.py b/modin/pandas/io.py
index 02fd8ba1de2..e29629ebd46 100644
--- a/modin/pandas/io.py
+++ b/modin/pandas/io.py
@@ -149,7 +149,7 @@ def read_xml(
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-) -> "DataFrame":
+) -> DataFrame:
     ErrorMessage.default_to_pandas("read_xml")
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
     return ModinObjects.DataFrame(pandas.read_xml(**kwargs))
@@ -217,7 +217,7 @@ def read_csv(
     float_precision: Literal["high", "legacy"] | None = None,
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-) -> "DataFrame" | TextFileReader:
+) -> DataFrame | TextFileReader:
     # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args
     _pd_read_csv_signature = {
         val.name for val in inspect.signature(pandas.read_csv).parameters.values()
@@ -289,7 +289,7 @@ def read_table(
     float_precision: str | None = None,
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-) -> "DataFrame" | TextFileReader:
+) -> DataFrame | TextFileReader:
     # ISSUE #2408: parse parameter shared with pandas read_csv and read_table and update with provided args
     _pd_read_table_signature = {
         val.name for val in inspect.signature(pandas.read_table).parameters.values()
@@ -314,7 +314,7 @@ def read_parquet(
     filesystem=None,
     filters=None,
     **kwargs,
-) -> "DataFrame":
+) -> DataFrame:
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
 
     if engine == "fastparquet" and dtype_backend is not no_default:
@@ -360,7 +360,7 @@ def read_json(
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     engine="ujson",
-) -> "DataFrame" | "Series" | pandas.io.json._json.JsonReader:
+) -> DataFrame | Series | pandas.io.json._json.JsonReader:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -384,7 +384,7 @@ def read_gbq(
     use_bqstorage_api: bool | None = None,
     max_results: int | None = None,
     progress_bar_type: str | None = None,
-) -> "DataFrame":
+) -> DataFrame:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
     kwargs.update(kwargs.pop("kwargs", {}))
 
@@ -416,7 +416,7 @@ def read_html(
     extract_links: Literal[None, "header", "footer", "body", "all"] = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     storage_options: StorageOptions = None,
-) -> list["DataFrame"]:  # noqa: PR01, RT01, D200
+) -> list[DataFrame]:  # noqa: PR01, RT01, D200
     """
     Read HTML tables into a ``DataFrame`` object.
     """
@@ -434,7 +434,7 @@ def read_clipboard(
     sep=r"\s+",
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     **kwargs,
-):  # pragma: no cover  # noqa: PR01, RT01, D200
+) -> DataFrame:  # pragma: no cover  # noqa: PR01, RT01, D200
     """
     Read text from clipboard and pass to read_csv.
     """
@@ -482,7 +482,7 @@ def read_excel(
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     engine_kwargs: Optional[dict] = None,
-) -> "DataFrame" | dict[IntStrT, "DataFrame"]:
+) -> DataFrame | dict[IntStrT, DataFrame]:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -533,7 +533,7 @@ def read_feather(
     use_threads: bool = True,
     storage_options: StorageOptions = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-):
+) -> DataFrame:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -560,7 +560,7 @@ def read_stata(
     iterator: bool = False,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
-) -> "DataFrame" | pandas.io.stata.StataReader:
+) -> DataFrame | pandas.io.stata.StataReader:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -580,7 +580,7 @@ def read_sas(
     chunksize: int | None = None,
     iterator: bool = False,
     compression: CompressionOptions = "infer",
-) -> "DataFrame" | pandas.io.sas.sasreader.ReaderBase:  # noqa: PR01, RT01, D200
+) -> DataFrame | pandas.io.sas.sasreader.ReaderBase:  # noqa: PR01, RT01, D200
     """
     Read SAS files stored as either XPORT or SAS7BDAT format files.
     """
@@ -606,7 +606,7 @@ def read_pickle(
     filepath_or_buffer,
     compression: CompressionOptions = "infer",
     storage_options: StorageOptions = None,
-):
+) -> DataFrame | Series:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -629,7 +629,7 @@ def read_sql(
     chunksize=None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     dtype=None,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame | Iterator[DataFrame]:  # noqa: PR01, RT01, D200
     """
     Read SQL query or database table into a DataFrame.
     """
@@ -660,7 +660,7 @@ def read_fwf(
     iterator: bool = False,
     chunksize: Optional[int] = None,
     **kwds,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame | TextFileReader:  # noqa: PR01, RT01, D200
     """
     Read a table of fixed-width formatted lines into DataFrame.
     """
@@ -695,7 +695,7 @@ def read_sql_table(
     columns=None,
     chunksize=None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame | Iterator[DataFrame]:  # noqa: PR01, RT01, D200
     """
     Read SQL database table into a DataFrame.
     """
@@ -720,7 +720,7 @@ def read_sql_query(
     chunksize: int | None = None,
     dtype: DtypeArg | None = None,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-) -> "DataFrame" | Iterator["DataFrame"]:
+) -> DataFrame | Iterator[DataFrame]:
     _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
 
     from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher
@@ -761,7 +761,7 @@ def read_spss(
     usecols: Optional[Sequence[str]] = None,
     convert_categoricals: bool = True,
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
-):  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Load an SPSS file from the file path, returning a DataFrame.
     """
@@ -788,7 +788,7 @@ def json_normalize(
     errors: Optional[str] = "raise",
     sep: str = ".",
     max_level: Optional[int] = None,
-) -> "DataFrame":  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Normalize semi-structured JSON data into a flat table.
     """
@@ -809,7 +809,7 @@ def read_orc(
     dtype_backend: Union[DtypeBackend, NoDefault] = no_default,
     filesystem=None,
     **kwargs,
-) -> "DataFrame":  # noqa: PR01, RT01, D200
+) -> DataFrame:  # noqa: PR01, RT01, D200
     """
     Load an ORC object from the file path, returning a DataFrame.
     """
@@ -948,7 +948,7 @@ def return_handler(*args, **kwargs):
         return method
 
 
-def from_non_pandas(df, index, columns, dtype):
+def from_non_pandas(df, index, columns, dtype) -> DataFrame | None:
     """
     Convert a non-pandas DataFrame into Modin DataFrame.
 
@@ -976,7 +976,7 @@ def from_non_pandas(df, index, columns, dtype):
     return new_qc
 
 
-def from_pandas(df):
+def from_pandas(df) -> DataFrame:
     """
     Convert a pandas DataFrame to a Modin DataFrame.
 
@@ -995,7 +995,7 @@ def from_pandas(df):
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
 
 
-def from_arrow(at):
+def from_arrow(at) -> DataFrame:
     """
     Convert an Arrow Table to a Modin DataFrame.
 
@@ -1014,7 +1014,7 @@ def from_arrow(at):
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_arrow(at))
 
 
-def from_dataframe(df):
+def from_dataframe(df) -> DataFrame:
     """
     Convert a DataFrame implementing the dataframe exchange protocol to a Modin DataFrame.
 
@@ -1035,7 +1035,7 @@ def from_dataframe(df):
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dataframe(df))
 
 
-def from_ray_dataset(ray_obj):
+def from_ray_dataset(ray_obj) -> DataFrame:
     """
     Convert a Ray Dataset into Modin DataFrame.
 
@@ -1063,7 +1063,7 @@ def from_ray_dataset(ray_obj):
     from_ray(ray_obj)
 
 
-def from_ray(ray_obj):
+def from_ray(ray_obj) -> DataFrame:
     """
     Convert a Ray Dataset into Modin DataFrame.
 
@@ -1086,7 +1086,7 @@ def from_ray(ray_obj):
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_ray(ray_obj))
 
 
-def from_dask(dask_obj):
+def from_dask(dask_obj) -> DataFrame:
     """
     Convert a Dask DataFrame to a Modin DataFrame.
 
@@ -1109,7 +1109,7 @@ def from_dask(dask_obj):
     return ModinObjects.DataFrame(query_compiler=FactoryDispatcher.from_dask(dask_obj))
 
 
-def to_pandas(modin_obj: SupportsPublicToPandas) -> Any:
+def to_pandas(modin_obj: SupportsPublicToPandas) -> pandas.DataFrame | pandas.Series:
     """
     Convert a Modin DataFrame/Series to a pandas DataFrame/Series.
 
@@ -1134,7 +1134,7 @@ def to_numpy(
 
     Parameters
     ----------
-    modin_obj : modin.DataFrame, modin."Series", modin.numpy.array
+    modin_obj : modin.DataFrame, modin.Series, modin.numpy.array
         The Modin distributed object to convert.
 
     Returns