diff --git a/docs/backends/_utils.py b/docs/backends/_utils.py index 404877f9939d7..e54de2ce120e0 100644 --- a/docs/backends/_utils.py +++ b/docs/backends/_utils.py @@ -17,10 +17,7 @@ def get_renderer(level: int) -> MdRenderer: @cache def get_backend(backend: str): - if backend == "pandas": - return get_object(f"ibis.backends.{backend}", "BasePandasBackend") - else: - return get_object(f"ibis.backends.{backend}", "Backend") + return get_object(f"ibis.backends.{backend}", "Backend") def get_callable(obj, name): diff --git a/docs/backends/pandas.qmd b/docs/backends/pandas.qmd index 042fc08e99629..025039396b51a 100644 --- a/docs/backends/pandas.qmd +++ b/docs/backends/pandas.qmd @@ -1,213 +1,7 @@ # pandas -[https://pandas.pydata.org/](https://pandas.pydata.org/) - -![](https://img.shields.io/badge/memtables-native-green?style=flat-square) ![](https://img.shields.io/badge/inputs-CSV | Parquet-blue?style=flat-square) ![](https://img.shields.io/badge/outputs-CSV | pandas | Parquet | PyArrow-orange?style=flat-square) - -::: {.callout-warning} -## The Pandas backend is slated for removal in Ibis 10.0 -We recommend using one of our other backends. - -Many workloads work well on the DuckDB and Polars backends, for example. -::: - - -## Install - -Install Ibis and dependencies for the pandas backend: - -::: {.panel-tabset} - -## `pip` - -Install with the `pandas` extra: - -```{.bash} -pip install 'ibis-framework[pandas]' -``` - -And connect: - -```{.python} -import ibis - -con = ibis.pandas.connect() # <1> -``` - -1. Adjust connection parameters as needed. - -## `conda` - -Install for pandas: - -```{.bash} -conda install -c conda-forge ibis-pandas -``` - -And connect: - -```{.python} -import ibis - -con = ibis.pandas.connect() # <1> -``` - -1. Adjust connection parameters as needed. - -## `mamba` - -Install for pandas: - -```{.bash} -mamba install -c conda-forge ibis-pandas -``` - -And connect: - -```{.python} -import ibis - -con = ibis.pandas.connect() # <1> -``` - -1. Adjust connection parameters as needed. +::: {.callout-note} +## The pandas backend was removed in Ibis version 10.0 +See [our blog post](../posts/farewell-pandas/index.qmd) on the topic for more information. ::: - - - -## User Defined functions (UDF) - -Ibis supports defining three kinds of user-defined functions for operations on -expressions targeting the pandas backend: **element-wise**, **reduction**, and -**analytic**. - -### Elementwise Functions - -An **element-wise** function is a function that takes N rows as input and -produces N rows of output. `log`, `exp`, and `floor` are examples of -element-wise functions. - -Here's how to define an element-wise function: - -```python -import ibis.expr.datatypes as dt -from ibis.backends.pandas.udf import udf - -@udf.elementwise(input_type=[dt.int64], output_type=dt.double) -def add_one(x): - return x + 1.0 -``` - -### Reduction Functions - -A **reduction** is a function that takes N rows as input and produces 1 row -as output. `sum`, `mean` and `count` are examples of reductions. In -the context of a `GROUP BY`, reductions produce 1 row of output _per -group_. - -Here's how to define a reduction function: - -```python -import ibis.expr.datatypes as dt -from ibis.backends.pandas.udf import udf - -@udf.reduction(input_type=[dt.double], output_type=dt.double) -def double_mean(series): - return 2 * series.mean() -``` - -### Analytic Functions - -An **analytic** function is like an **element-wise** function in that it takes -N rows as input and produces N rows of output. 
The key difference is that -analytic functions can be applied _per group_ using window functions. Z-score -is an example of an analytic function. - -Here's how to define an analytic function: - -```python -import ibis.expr.datatypes as dt -from ibis.backends.pandas.udf import udf - -@udf.analytic(input_type=[dt.double], output_type=dt.double) -def zscore(series): - return (series - series.mean()) / series.std() -``` - -### Details of pandas UDFs - -- Element-wise provide support - for applying your UDF to any combination of scalar values and columns. -- Reductions provide support for - whole column aggregations, grouped aggregations, and application of your - function over a window. -- Analytic functions work in both grouped and non-grouped - settings -- The objects you receive as input arguments are either `pandas.Series` or - Python/NumPy scalars. - -::: {.callout-warning} -## Keyword arguments must be given a default - -Any keyword arguments must be given a default value or the function **will -not work**. -::: - -A common Python convention is to set the default value to `None` and -handle setting it to something not `None` in the body of the function. - -Using `add_one` from above as an example, the following call will receive a -`pandas.Series` for the `x` argument: - -```python -import ibis -import pandas as pd -df = pd.DataFrame({'a': [1, 2, 3]}) -con = ibis.pandas.connect({'df': df}) -t = con.table('df') -expr = add_one(t.a) -expr -``` - -And this will receive the `int` 1: - -```python -expr = add_one(1) -expr -``` - -Since the pandas backend passes around `**kwargs` you can accept `**kwargs` -in your function: - -```python -import ibis.expr.datatypes as dt -from ibis.backends.pandas.udf import udf - -@udf.elementwise([dt.int64], dt.double) -def add_two(x, **kwargs): # do stuff with kwargs - return x + 2.0 -``` - -Or you can leave them out as we did in the example above. You can also -optionally accept specific keyword arguments. 
- -For example: - -```python -import ibis.expr.datatypes as dt -from ibis.backends.pandas.udf import udf - -@udf.elementwise([dt.int64], dt.double) -def add_two_with_none(x, y=None): - if y is None: - y = 2.0 - return x + y -``` - -```{python} -#| echo: false -BACKEND = "Pandas" -``` - -{{< include ./_templates/api.qmd >}} diff --git a/docs/backends_sankey.py b/docs/backends_sankey.py index b04cea212fafe..90529a42826c9 100644 --- a/docs/backends_sankey.py +++ b/docs/backends_sankey.py @@ -42,7 +42,7 @@ def to_greyish(hex_code, grey_value=128): "SQLite", "Trino", ], - list(category_colors.keys())[2]: ["Dask", "pandas", "Polars"], + list(category_colors.keys())[2]: ["Polars"], } nodes, links = [], [] diff --git a/ibis/backends/conftest.py b/ibis/backends/conftest.py index 76ae93e241e6e..7b4ef991fee82 100644 --- a/ibis/backends/conftest.py +++ b/ibis/backends/conftest.py @@ -4,7 +4,6 @@ import importlib import importlib.metadata import itertools -import operator from functools import cache from pathlib import Path from typing import TYPE_CHECKING, Any @@ -12,7 +11,6 @@ import _pytest import pytest from packaging.requirements import Requirement -from packaging.version import parse as vparse import ibis from ibis import util @@ -30,22 +28,6 @@ from ibis.backends.tests.base import BackendTest -def compare_versions(module_name, given_version, op): - try: - current_version = importlib.metadata.version(module_name) - return op(vparse(current_version), vparse(given_version)) - except importlib.metadata.PackageNotFoundError: - return False - - -def is_newer_than(module_name, given_version): - return compare_versions(module_name, given_version, operator.gt) - - -def is_older_than(module_name, given_version): - return compare_versions(module_name, given_version, operator.lt) - - TEST_TABLES = { "functional_alltypes": ibis.schema( { @@ -486,7 +468,7 @@ def _setup_backend(request, data_dir, tmp_path_factory, worker_id): @pytest.fixture( - params=_get_backends_to_test(discard=("pandas",)), + params=_get_backends_to_test(), scope="session", ) def ddl_backend(request, data_dir, tmp_path_factory, worker_id): @@ -501,7 +483,7 @@ def ddl_con(ddl_backend): @pytest.fixture( - params=_get_backends_to_test(keep=("pandas", "pyspark")), + params=_get_backends_to_test(keep=("pyspark",)), scope="session", ) def udf_backend(request, data_dir, tmp_path_factory, worker_id): diff --git a/ibis/backends/pandas/__init__.py b/ibis/backends/pandas/__init__.py deleted file mode 100644 index 1ac0ce2323af3..0000000000000 --- a/ibis/backends/pandas/__init__.py +++ /dev/null @@ -1,369 +0,0 @@ -from __future__ import annotations - -import warnings -from functools import lru_cache -from typing import TYPE_CHECKING, Any - -import pandas as pd -import pyarrow as pa -import pyarrow_hotfix # noqa: F401 - -import ibis.common.exceptions as com -import ibis.config -import ibis.expr.operations as ops -import ibis.expr.schema as sch -import ibis.expr.types as ir -from ibis import util -from ibis.backends import BaseBackend, NoUrl -from ibis.common.dispatch import lazy_singledispatch -from ibis.formats.pandas import PandasData, PandasSchema -from ibis.formats.pyarrow import PyArrowData - -if TYPE_CHECKING: - import pathlib - from collections.abc import Mapping, MutableMapping - - -class BasePandasBackend(BaseBackend, NoUrl): - """Base class for backends based on pandas.""" - - name = "pandas" - dialect = None - backend_table_type = pd.DataFrame - - class Options(ibis.config.Config): - enable_trace: bool = False - - def do_connect( - self, - 
dictionary: MutableMapping[str, pd.DataFrame] | None = None, - ) -> None: - """Construct a client from a dictionary of pandas DataFrames. - - Parameters - ---------- - dictionary - An optional mapping of string table names to pandas DataFrames. - - Examples - -------- - >>> import ibis - >>> ibis.pandas.connect({"t": pd.DataFrame({"a": [1, 2, 3]})}) # doctest: +ELLIPSIS - - """ - warnings.warn( - f"The {self.name} backend is slated for removal in 10.0.", - DeprecationWarning, - ) - self.dictionary = dictionary or {} - self.schemas: MutableMapping[str, sch.Schema] = {} - - def disconnect(self) -> None: - pass - - def from_dataframe( - self, - df: pd.DataFrame, - name: str = "df", - client: BasePandasBackend | None = None, - ) -> ir.Table: - """Construct an ibis table from a pandas DataFrame. - - Parameters - ---------- - df - A pandas DataFrame - name - The name of the pandas DataFrame - client - Client dictionary will be mutated with the name of the DataFrame, - if not provided a new client is created - - Returns - ------- - Table - A table expression - - """ - if client is None: - return self.connect({name: df}).table(name) - client.dictionary[name] = df - return client.table(name) - - def read_csv( - self, source: str | pathlib.Path, table_name: str | None = None, **kwargs: Any - ): - """Register a CSV file as a table in the current session. - - Parameters - ---------- - source - The data source. Can be a local or remote file, pathlike objects - also accepted. - table_name - An optional name to use for the created table. This defaults to - a generated name. - **kwargs - Additional keyword arguments passed to Pandas loading function. - See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html - for more information. - - Returns - ------- - ir.Table - The just-registered table - - """ - table_name = table_name or util.gen_name("read_csv") - df = pd.read_csv(source, **kwargs) - self.dictionary[table_name] = df - return self.table(table_name) - - def read_parquet( - self, source: str | pathlib.Path, table_name: str | None = None, **kwargs: Any - ): - """Register a parquet file as a table in the current session. - - Parameters - ---------- - source - The data source(s). May be a path to a file, an iterable of files, - or directory of parquet files. - table_name - An optional name to use for the created table. This defaults to - a generated name. - **kwargs - Additional keyword arguments passed to Pandas loading function. - See https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html - for more information. - - Returns - ------- - ir.Table - The just-registered table - - """ - table_name = table_name or util.gen_name("read_parquet") - df = pd.read_parquet(source, **kwargs) - self.dictionary[table_name] = df - return self.table(table_name) - - @property - def version(self) -> str: - return pd.__version__ - - def list_tables(self, like=None, database=None): - """Return the list of table names in the current database. - - Parameters - ---------- - like - A pattern in Python's regex format. - database - Unused in the pandas backend. - - Returns - ------- - list[str] - The list of the table names that match the pattern `like`. 
- """ - return self._filter_with_like(list(self.dictionary.keys()), like) - - def table(self, name: str, schema: sch.Schema | None = None): - inferred_schema = self.get_schema(name) - overridden_schema = {**inferred_schema, **(schema or {})} - return ops.DatabaseTable(name, overridden_schema, self).to_expr() - - def get_schema(self, table_name, *, database=None): - try: - schema = self.schemas[table_name] - except KeyError: - df = self.dictionary[table_name] - self.schemas[table_name] = schema = PandasData.infer_table(df) - - return schema - - def compile(self, expr, *args, **kwargs): - return expr - - def create_table( - self, - name: str, - obj: pd.DataFrame | pa.Table | ir.Table | None = None, - *, - schema: sch.Schema | None = None, - database: str | None = None, - temp: bool | None = None, - overwrite: bool = False, - ) -> ir.Table: - """Create a table.""" - if temp: - com.IbisError( - "Passing `temp=True` to the Pandas backend create_table method has no " - "effect: all tables are in memory and temporary." - ) - if database: - com.IbisError( - "Passing `database` to the Pandas backend create_table method has no " - "effect: Pandas cannot set a database." - ) - if obj is None and schema is None: - raise com.IbisError("The schema or obj parameter is required") - if schema is not None: - schema = ibis.schema(schema) - - if obj is not None: - df = self._convert_object(obj) - else: - dtypes = dict(PandasSchema.from_ibis(schema)) - df = pd.DataFrame(columns=dtypes.keys()).astype(dtypes) - - if name in self.dictionary and not overwrite: - raise com.IbisError(f"Cannot overwrite existing table `{name}`") - - self.dictionary[name] = df - - if schema is not None: - self.schemas[name] = schema - return self.table(name) - - def create_view( - self, - name: str, - obj: ir.Table, - *, - database: str | None = None, - overwrite: bool = False, - ) -> ir.Table: - return self.create_table( - name, obj=obj, temp=None, database=database, overwrite=overwrite - ) - - def drop_view(self, name: str, *, force: bool = False) -> None: - self.drop_table(name, force=force) - - def drop_table(self, name: str, *, force: bool = False) -> None: - try: - del self.dictionary[name] - except KeyError: - if not force: - raise com.IbisError(f"Table {name} does not exist") from None - - def _convert_object(self, obj: Any) -> Any: - return _convert_object(obj, self) - - @classmethod - @lru_cache - def _get_operations(cls): - from ibis.backends.pandas.kernels import supported_operations - - return supported_operations - - @classmethod - def has_operation(cls, operation: type[ops.Value]) -> bool: - return operation in cls._get_operations() - - def _drop_cached_table(self, name): - del self.dictionary[name] - - def to_pyarrow( - self, - expr: ir.Expr, - params: Mapping[ir.Scalar, Any] | None = None, - limit: int | str | None = None, - **kwargs: Any, - ) -> pa.Table: - table_expr = expr.as_table() - output = pa.Table.from_pandas( - self.execute(table_expr, params=params, limit=limit, **kwargs) - ) - - # cudf.pandas adds a column with the name `__index_level_0__` (and maybe - # other index level columns) but these aren't part of the known schema - # so we drop them - output = output.drop( - filter(lambda col: col.startswith("__index_level_"), output.column_names) - ) - table = PyArrowData.convert_table(output, table_expr.schema()) - return expr.__pyarrow_result__(table) - - def to_pyarrow_batches( - self, - expr: ir.Expr, - *, - params: Mapping[ir.Scalar, Any] | None = None, - limit: int | str | None = None, - chunk_size: int = 
1000000, - **kwargs: Any, - ) -> pa.ipc.RecordBatchReader: - pa = self._import_pyarrow() - pa_table = self.to_pyarrow( - expr.as_table(), params=params, limit=limit, **kwargs - ) - return pa.ipc.RecordBatchReader.from_batches( - pa_table.schema, pa_table.to_batches(max_chunksize=chunk_size) - ) - - -class Backend(BasePandasBackend): - name = "pandas" - - def execute(self, query, params=None, limit="default", **kwargs): - from ibis.backends.pandas.executor import PandasExecutor - - if limit != "default" and limit is not None: - raise ValueError( - "limit parameter to execute is not yet implemented in the " - "pandas backend" - ) - - if not isinstance(query, ir.Expr): - raise TypeError( - f"`query` has type {type(query).__name__!r}, expected ibis.expr.types.Expr" - ) - - params = params or {} - params = {k.op() if isinstance(k, ir.Expr) else k: v for k, v in params.items()} - - return PandasExecutor.execute(query.op(), backend=self, params=params) - - def _create_cached_table(self, name, expr): - return self.create_table(name, expr.execute()) - - def _finalize_memtable(self, name: str) -> None: - """No-op, let Python handle clean up.""" - - -@lazy_singledispatch -def _convert_object(obj: Any, _conn): - raise com.BackendConversionError( - f"Unable to convert {obj.__class__} object " - f"to backend type: {_conn.__class__.backend_table_type}" - ) - - -@_convert_object.register("ibis.expr.types.Table") -def _table(obj, _conn): - if isinstance(op := obj.op(), ops.InMemoryTable): - return op.data.to_frame() - else: - raise com.BackendConversionError( - f"Unable to convert {obj.__class__} object " - f"to backend type: {_conn.__class__.backend_table_type}" - ) - - -@_convert_object.register("polars.DataFrame") -@_convert_object.register("pyarrow.Table") -def _pa_polars(obj, _conn): - return obj.to_pandas() - - -@_convert_object.register("polars.LazyFrame") -def _polars_lazy(obj, _conn): - return obj.collect().to_pandas() - - -@_convert_object.register("pandas.DataFrame") -def _pandas(obj, _conn): - return obj diff --git a/ibis/backends/pandas/convert.py b/ibis/backends/pandas/convert.py deleted file mode 100644 index 95cde053d46cc..0000000000000 --- a/ibis/backends/pandas/convert.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pandas.api.types as pdt - -import ibis.expr.datatypes as dt -from ibis.formats.pandas import DataMapper, PandasType - - -class PandasConverter(DataMapper): - @classmethod - def convert_scalar(cls, obj, dtype): - series = pd.Series([obj]) - casted = cls.convert_column(series, dtype) - return casted[0] - - @classmethod - def convert_column(cls, obj, dtype): - pandas_type = PandasType.from_ibis(dtype) - - method_name = f"convert_{dtype.__class__.__name__}" - convert_method = getattr(cls, method_name, cls.convert_default) - - return convert_method(obj, dtype, pandas_type) - - @classmethod - def convert_default(cls, s, dtype, pandas_type): - if pandas_type == np.object_: - func = lambda x: x if x is pd.NA else dt.normalize(dtype, x) - return s.map(func, na_action="ignore").astype(pandas_type) - else: - return s.astype(pandas_type) - - @classmethod - def convert_Integer(cls, s, dtype, pandas_type): - if pdt.is_datetime64_any_dtype(s.dtype): - return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) - else: - return s.astype(pandas_type, errors="ignore") - - convert_SignedInteger = convert_UnsignedInteger = convert_Integer - convert_Int64 = convert_Int32 = convert_Int16 = convert_Int8 = 
convert_SignedInteger - convert_UInt64 = convert_UInt32 = convert_UInt16 = convert_UInt8 = ( - convert_UnsignedInteger - ) - - @classmethod - def convert_Floating(cls, s, dtype, pandas_type): - if pdt.is_datetime64_any_dtype(s.dtype): - return s.astype("int64").floordiv(int(1e9)).astype(pandas_type) - else: - return s.astype(pandas_type, errors="ignore") - - convert_Float64 = convert_Float32 = convert_Float16 = convert_Floating - - @classmethod - def convert_Timestamp(cls, s, dtype, pandas_type): - if isinstance(s.dtype, pd.DatetimeTZDtype): - return s.dt.tz_convert(dtype.timezone) - elif pdt.is_datetime64_dtype(s.dtype): - return s.dt.tz_localize(dtype.timezone) - elif pdt.is_numeric_dtype(s.dtype): - return pd.to_datetime(s, unit="s").dt.tz_localize(dtype.timezone) - else: - try: - return pd.to_datetime(s).dt.tz_convert(dtype.timezone) - except TypeError: - return pd.to_datetime(s).dt.tz_localize(dtype.timezone) - - @classmethod - def convert_Date(cls, s, dtype, pandas_type): - if isinstance(s.dtype, pd.DatetimeTZDtype): - s = s.dt.tz_convert("UTC").dt.tz_localize(None) - elif pdt.is_numeric_dtype(s.dtype): - s = pd.to_datetime(s, unit="D") - else: - s = pd.to_datetime(s).astype(pandas_type, errors="ignore") - - return s.dt.normalize() - - @classmethod - def convert_String(cls, s, dtype, pandas_type): - # TODO(kszucs): should switch to the new pandas string type and convert - # object columns using s.convert_dtypes() method - return s.map(str, na_action="ignore").astype(object) diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py deleted file mode 100644 index 4762132866c35..0000000000000 --- a/ibis/backends/pandas/executor.py +++ /dev/null @@ -1,872 +0,0 @@ -from __future__ import annotations - -import operator -from functools import reduce - -import numpy as np -import pandas as pd -from packaging.version import parse as vparse - -import ibis.backends.pandas.kernels as pandas_kernels -import ibis.expr.operations as ops -from ibis.backends.pandas.convert import PandasConverter -from ibis.backends.pandas.helpers import ( - GroupedFrame, - PandasUtils, - RangeFrame, - RowsFrame, - UngroupedFrame, -) -from ibis.backends.pandas.rewrites import ( - PandasAggregate, - PandasAsofJoin, - PandasJoin, - PandasLimit, - PandasRename, - PandasResetIndex, - PandasScalarSubquery, - PandasWindowFrame, - PandasWindowFunction, - plan, -) -from ibis.common.dispatch import Dispatched -from ibis.common.exceptions import ( - OperationNotDefinedError, - UnboundExpressionError, - UnsupportedOperationError, -) -from ibis.formats.pandas import PandasData, PandasType -from ibis.util import any_of, gen_name - -# ruff: noqa: F811 - - -class PandasExecutor(Dispatched, PandasUtils): - name = "pandas" - kernels = pandas_kernels - - @classmethod - def visit(cls, op: ops.Node, **kwargs): - raise OperationNotDefinedError( - f"Operation {op!r} is not implemented for the pandas backend" - ) - - @classmethod - def visit(cls, op: ops.Literal, value, dtype): - if dtype.is_interval(): - value = pd.Timedelta(value, dtype.unit.short) - elif dtype.is_array(): - value = np.array(value) - elif dtype.is_date(): - value = pd.Timestamp(value, tz="UTC").tz_localize(None) - return value - - @classmethod - def visit(cls, op: ops.Field, rel, name): - return rel[name] - - @classmethod - def visit(cls, op: ops.Alias, arg, name): - try: - return arg.rename(name) - except AttributeError: - return arg - - @classmethod - def visit(cls, op: ops.SortKey, expr, ascending, nulls_first): - return expr - - @classmethod - 
def visit(cls, op: ops.Cast, arg, to): - if arg is None: - return None - elif isinstance(arg, pd.Series): - return PandasConverter.convert_column(arg, to) - else: - return PandasConverter.convert_scalar(arg, to) - - @classmethod - def visit(cls, op: ops.Greatest, arg): - return cls.columnwise(lambda df: df.max(axis=1), arg) - - @classmethod - def visit(cls, op: ops.Least, arg): - return cls.columnwise(lambda df: df.min(axis=1), arg) - - @classmethod - def visit(cls, op: ops.Coalesce, arg): - return cls.columnwise(lambda df: df.bfill(axis=1).iloc[:, 0], arg) - - @classmethod - def visit(cls, op: ops.Value, **operands): - # automatically pick the correct kernel based on the operand types - typ = type(op) - name = op.name - dtype = PandasType.from_ibis(op.dtype) - kwargs = {"operands": operands, "name": name, "dtype": dtype} - - # decimal operations have special implementations - if op.dtype.is_decimal(): - func = cls.kernels.elementwise_decimal[typ] - return cls.elementwise(func, **kwargs) - - # prefer generic implementations if available - if func := cls.kernels.generic.get(typ): - return cls.generic(func, **kwargs) - - if len(operands) < 1: - raise OperationNotDefinedError( - f"No implementation found for operation {typ}" - ) - _, *rest = operands.values() - is_multi_arg = bool(rest) - is_multi_column = any_of(rest, pd.Series) - - if is_multi_column: - if func := cls.kernels.columnwise.get(typ): - return cls.columnwise(func, **kwargs) - elif func := cls.kernels.rowwise.get(typ): - return cls.rowwise(func, **kwargs) - else: - raise OperationNotDefinedError( - "No columnwise or rowwise implementation found for " - f"multi-column operation {typ}" - ) - elif is_multi_arg: - if func := cls.kernels.columnwise.get(typ): - return cls.columnwise(func, **kwargs) - elif func := cls.kernels.serieswise.get(typ): - return cls.serieswise(func, **kwargs) - elif func := cls.kernels.rowwise.get(typ): - return cls.rowwise(func, **kwargs) - elif func := cls.kernels.elementwise.get(typ): - return cls.elementwise(func, **kwargs) - else: - raise OperationNotDefinedError( - "No columnwise, serieswise, rowwise or elementwise " - f"implementation found for multi-argument operation {typ}" - ) - else: # noqa: PLR5501 - if func := cls.kernels.serieswise.get(typ): - return cls.serieswise(func, **kwargs) - elif func := cls.kernels.elementwise.get(typ): - return cls.elementwise(func, **kwargs) - else: - raise OperationNotDefinedError( - "No serieswise or elementwise implementation found for " - f"single-argument operation {typ}" - ) - - @classmethod - def visit(cls, op: ops.IsNan, arg): - try: - return np.isnan(arg) - except (TypeError, ValueError): - # if `arg` contains `None` np.isnan will complain - # so we take advantage of NaN not equaling itself - # to do the correct thing - return arg != arg - - @classmethod - def visit( - cls, op: ops.SearchedCase | ops.SimpleCase, cases, results, default, base=None - ): - if base is not None: - cases = tuple(base == case for case in cases) - cases, _ = cls.asframe(cases, concat=False) - index = cases[0].index - results, _ = cls.asframe(results, concat=False) - out = np.select(cases, results, default) - return pd.Series(out, index=index) - - @classmethod - def visit(cls, op: ops.TimestampTruncate | ops.DateTruncate, arg, unit): - # TODO(kszucs): should use serieswise() - if vparse(pd.__version__) >= vparse("2.2"): - units = {"m": "min"} - else: - units = {"m": "Min", "ms": "L"} - - unit = units.get(unit.short, unit.short) - - if unit in "YQMWD": - return 
arg.dt.to_period(unit).dt.to_timestamp() - try: - return arg.dt.floor(unit) - except ValueError: - return arg.dt.to_period(unit).dt.to_timestamp() - - @classmethod - def visit(cls, op: ops.IntervalFromInteger, unit, **kwargs): - if unit.short in {"Y", "Q", "M", "W"}: - return cls.elementwise(lambda v: pd.DateOffset(**{unit.plural: v}), kwargs) - else: - return cls.serieswise( - lambda arg: arg.astype(f"timedelta64[{unit.short}]"), kwargs - ) - - @classmethod - def visit(cls, op: ops.BetweenTime, arg, lower_bound, upper_bound): - idx = pd.DatetimeIndex(arg) - if idx.tz is not None: - idx = idx.tz_convert(None) # make naive because times are naive - indexer = idx.indexer_between_time(lower_bound, upper_bound) - result = np.zeros(len(arg), dtype=np.bool_) - result[indexer] = True - return pd.Series(result) - - @classmethod - def visit(cls, op: ops.FindInSet, needle, values): - (needle, *haystack), _ = cls.asframe((needle, *values), concat=False) - condlist = [needle == col for col in haystack] - choicelist = [i for i, _ in enumerate(haystack)] - result = np.select(condlist, choicelist, default=-1) - return pd.Series(result, name=op.name) - - @classmethod - def visit(cls, op: ops.Array, exprs): - return cls.rowwise(lambda row: np.array(row, dtype=object), exprs) - - @classmethod - def visit(cls, op: ops.StructColumn, names, values): - return cls.rowwise(lambda row: dict(zip(names, row)), values) - - @classmethod - def visit(cls, op: ops.ArrayConcat, arg): - return cls.rowwise(lambda row: np.concatenate(row.values), arg) - - @classmethod - def visit(cls, op: ops.Unnest, arg): - arg = cls.asseries(arg) - mask = arg.map(lambda v: bool(len(v)), na_action="ignore") - return arg[mask].explode() - - @classmethod - def visit( - cls, op: ops.ElementWiseVectorizedUDF, func, func_args, input_type, return_type - ): - """Execute an elementwise UDF.""" - - res = func(*func_args) - if isinstance(res, pd.DataFrame): - # it is important otherwise it is going to fill up the memory - res = res.apply(lambda row: row.to_dict(), axis=1) - - return res - - ############################# Reductions ################################## - - @classmethod - def visit(cls, op: ops.Reduction, arg, where, order_by=()): - if order_by: - raise UnsupportedOperationError( - "ordering of order-sensitive aggregations via `order_by` is " - "not supported for this backend" - ) - func = cls.kernels.reductions[type(op)] - return cls.agg(func, arg, where) - - @classmethod - def visit(cls, op: ops.CountStar, arg, where): - def agg(df): - if where is None: - return len(df) - else: - return df[where.name].sum() - - return agg - - @classmethod - def visit(cls, op: ops.CountDistinctStar, arg, where): - def agg(df): - if where is None: - return df.nunique() - else: - return df[where.name].nunique() - - return agg - - @classmethod - def visit(cls, op: ops.Arbitrary, arg, where): - return cls.agg(cls.kernels.reductions[ops.Arbitrary], arg, where) - - @classmethod - def visit(cls, op: ops.ArgMin | ops.ArgMax, arg, key, where): - func = operator.methodcaller(op.__class__.__name__.lower()) - - if where is None: - - def agg(df): - indices = func(df[key.name]) - return df[arg.name].iloc[indices] - else: - - def agg(df): - mask = df[where.name] - filtered = df[mask] - indices = func(filtered[key.name]) - return filtered[arg.name].iloc[indices] - - return agg - - @classmethod - def visit(cls, op: ops.Variance, arg, where, how): - ddof = {"pop": 0, "sample": 1}[how] - return cls.agg(lambda x: x.var(ddof=ddof), arg, where) - - @classmethod - def 
visit(cls, op: ops.StandardDev, arg, where, how): - ddof = {"pop": 0, "sample": 1}[how] - return cls.agg(lambda x: x.std(ddof=ddof), arg, where) - - @classmethod - def visit(cls, op: ops.ArrayCollect, arg, where, order_by, include_null): - if order_by: - raise UnsupportedOperationError( - "ordering of order-sensitive aggregations via `order_by` is " - "not supported for this backend" - ) - return cls.agg( - (lambda x: x.tolist() if include_null else x.dropna().tolist()), arg, where - ) - - @classmethod - def visit(cls, op: ops.First, arg, where, order_by, include_null): - if order_by: - raise UnsupportedOperationError( - "ordering of order-sensitive aggregations via `order_by` is " - "not supported for this backend" - ) - - def first(arg): - if not include_null: - arg = arg.dropna() - return arg.iat[0] if len(arg) else None - - return cls.agg(first, arg, where) - - @classmethod - def visit(cls, op: ops.Last, arg, where, order_by, include_null): - if order_by: - raise UnsupportedOperationError( - "ordering of order-sensitive aggregations via `order_by` is " - "not supported for this backend" - ) - - def last(arg): - if not include_null: - arg = arg.dropna() - return arg.iat[-1] if len(arg) else None - - return cls.agg(last, arg, where) - - @classmethod - def visit(cls, op: ops.Correlation, left, right, where, how): - if where is None: - - def agg(df): - return df[left.name].corr(df[right.name]) - else: - - def agg(df): - mask = df[where.name] - lhs = df[left.name][mask] - rhs = df[right.name][mask] - return lhs.corr(rhs) - - return agg - - @classmethod - def visit(cls, op: ops.Covariance, left, right, where, how): - ddof = {"pop": 0, "sample": 1}[how] - if where is None: - - def agg(df): - return df[left.name].cov(df[right.name], ddof=ddof) - else: - - def agg(df): - mask = df[where.name] - lhs = df[left.name][mask] - rhs = df[right.name][mask] - return lhs.cov(rhs, ddof=ddof) - - return agg - - @classmethod - def visit(cls, op: ops.GroupConcat, arg, sep, where, order_by): - if order_by: - raise UnsupportedOperationError( - "ordering of order-sensitive aggregations via `order_by` is " - "not supported for this backend" - ) - - if where is None: - - def agg(df): - return sep.join(df[arg.name].astype(str)) - else: - - def agg(df): - mask = df[where.name] - group = df[arg.name][mask] - if group.empty: - return pd.NA - return sep.join(group) - - return agg - - @classmethod - def visit(cls, op: ops.Quantile, arg, quantile, where): - return cls.agg(lambda x: x.quantile(quantile), arg, where) - - @classmethod - def visit(cls, op: ops.MultiQuantile, arg, quantile, where): - return cls.agg(lambda x: list(x.quantile(quantile)), arg, where) - - @classmethod - def visit( - cls, op: ops.ReductionVectorizedUDF, func, func_args, input_type, return_type - ): - def agg(df): - args = [df[col.name] for col in func_args] - return func(*args) - - return agg - - ############################# Analytic #################################### - - @classmethod - def visit(cls, op: ops.RowNumber): - def agg(df, order_keys): - return pd.Series(np.arange(len(df)), index=df.index) - - return agg - - @classmethod - def visit(cls, op: ops.Lag | ops.Lead, arg, offset, default): - if isinstance(op, ops.Lag): - sign = operator.pos - else: - sign = operator.neg - - if op.offset is not None and op.offset.dtype.is_interval(): - - def agg(df, order_keys): - df = df.set_index(order_keys) - col = df[arg.name].shift(freq=sign(offset)) - res = col.reindex(df.index) - if not pd.isnull(default): - res = res.fillna(default) - return 
res.reset_index(drop=True) - - else: - offset = 1 if offset is None else offset - - def agg(df, order_keys): - res = df[arg.name].shift(sign(offset)) - if not pd.isnull(default): - res = res.fillna(default) - return res - - return agg - - @classmethod - def visit(cls, op: ops.MinRank | ops.DenseRank): - method = "dense" if isinstance(op, ops.DenseRank) else "min" - - def agg(df, order_keys): - if len(order_keys) == 0: - raise ValueError("order_by argument is required for rank functions") - elif len(order_keys) == 1: - s = df[order_keys[0]] - else: - s = df[order_keys].apply(tuple, axis=1) - - return s.rank(method=method).astype("int64") - 1 - - return agg - - @classmethod - def visit(cls, op: ops.PercentRank): - def agg(df, order_keys): - if len(order_keys) == 0: - raise ValueError("order_by argument is required for rank functions") - elif len(order_keys) == 1: - s = df[order_keys[0]] - else: - s = df[order_keys].apply(tuple, axis=1) - - return s.rank(method="min").sub(1).div(len(df) - 1) - - return agg - - @classmethod - def visit(cls, op: ops.CumeDist): - def agg(df, order_keys): - if len(order_keys) == 0: - raise ValueError("order_by argument is required for rank functions") - elif len(order_keys) == 1: - s = df[order_keys[0]] - else: - s = df[order_keys].apply(tuple, axis=1) - - return s.rank(method="average", pct=True) - - return agg - - @classmethod - def visit( - cls, op: ops.AnalyticVectorizedUDF, func, func_args, input_type, return_type - ): - def agg(df, order_keys): - args = [df[col.name] for col in func_args] - res = func(*args) - if isinstance(res, pd.DataFrame): - # it is important otherwise it is going to fill up the memory - res = res.apply(lambda row: row.to_dict(), axis=1) - return res - - return agg - - ############################ Window functions ############################# - - @classmethod - def visit(cls, op: ops.WindowBoundary, value, preceding): - return value - - @classmethod - def visit(cls, op: PandasWindowFrame, table, how, start, end, group_by, order_by): - if start is not None and op.start.preceding: - start = -start - if end is not None and op.end.preceding: - end = -end - - table = table.assign(__start__=start, __end__=end) - - # TODO(kszucs): order by ibis.random() is not supported because it is - # excluded from the group by keys due to its scalar shape - group_keys = [group.name for group in op.group_by] - order_keys = [key.name for key in op.order_by if key.shape.is_columnar()] - ascending = [key.ascending for key in op.order_by if key.shape.is_columnar()] - - if order_by: - table = table.sort_values(order_keys, ascending=ascending, kind="mergesort") - - if group_by: - frame = GroupedFrame(df=table, group_keys=group_keys) - else: - frame = UngroupedFrame(df=table) - - if start is None and end is None: - return frame - elif how == "rows": - return RowsFrame(parent=frame) - elif how == "range": - if len(order_keys) != 1: - raise NotImplementedError( - "Only single column order by is supported for range window frames" - ) - return RangeFrame(parent=frame, order_key=order_keys[0]) - else: - raise NotImplementedError(f"Unsupported window frame type: {how}") - - @classmethod - def visit(cls, op: PandasWindowFunction, func, frame): - if isinstance(op.func, ops.Analytic): - order_keys = [key.name for key in op.frame.order_by] - return frame.apply_analytic(func, order_keys=order_keys) - else: - return frame.apply_reduction(func) - - ############################ Relational ################################### - - @classmethod - def visit(cls, op: 
ops.DatabaseTable, name, schema, source, namespace): - try: - return source.dictionary[name] - except KeyError: - raise UnboundExpressionError( - f"{name} is not a table in the {source.name!r} backend, you " - "probably tried to execute an expression without a data source" - ) - - @classmethod - def visit(cls, op: ops.InMemoryTable, name, schema, data): - return data.to_frame() - - @classmethod - def visit(cls, op: ops.DummyTable, values): - df, _ = cls.asframe(values) - return df - - @classmethod - def visit(cls, op: ops.Reference, parent, **kwargs): - return parent - - @classmethod - def visit(cls, op: PandasRename, parent, mapping): - return parent.rename(columns=mapping) - - @classmethod - def visit(cls, op: PandasLimit, parent, n, offset): - n = n.iat[0, 0] - offset = offset.iat[0, 0] - if n is None: - return parent.iloc[offset:] - else: - return parent.iloc[offset : offset + n] - - @classmethod - def visit(cls, op: PandasResetIndex, parent): - return parent.reset_index(drop=True) - - @classmethod - def visit(cls, op: ops.Sample, parent, fraction, method, seed): - return parent.sample(frac=fraction, random_state=seed) - - @classmethod - def visit(cls, op: ops.Project, parent, values): - df, all_scalars = cls.asframe(values) - if all_scalars and len(parent) != len(df): - df = cls.concat([df] * len(parent)) - return df - - @classmethod - def visit(cls, op: ops.Filter, parent, predicates): - if predicates: - pred = reduce(operator.and_, predicates) - if len(pred) != len(parent): - raise RuntimeError( - "Selection predicate length does not match underlying table" - ) - parent = parent.loc[pred].reset_index(drop=True) - return parent - - @classmethod - def visit(cls, op: ops.Sort, parent, keys): - # 1. add sort key columns to the dataframe if they are not already present - # 2. sort the dataframe using those columns - # 3. 
drop the sort key columns - ascending = [key.ascending for key in op.keys] - nulls_first = [key.nulls_first for key in op.keys] - - if all(nulls_first): - na_position = "first" - elif not any(nulls_first): - na_position = "last" - else: - raise ValueError( - "pandas does not support specifying null ordering for individual columns" - ) - - newcols = {gen_name("sort_key"): col for col in keys} - names = list(newcols.keys()) - df = parent.assign(**newcols) - df = df.sort_values( - by=names, - ascending=ascending, - na_position=na_position, - ignore_index=True, - kind="mergesort", - ) - return df.drop(columns=names) - - @classmethod - def visit(cls, op: ops.DropColumns, parent, columns_to_drop): - return parent.drop(columns=list(columns_to_drop)) - - @classmethod - def visit(cls, op: PandasAggregate, parent, groups, metrics): - if groups: - parent = parent.groupby([col.name for col in groups.values()]) - metrics = {k: parent.apply(v) for k, v in metrics.items()} - result = cls.concat(metrics, axis=1).reset_index() - renames = {v.name: k for k, v in op.groups.items()} - return result.rename(columns=renames) - else: - results = {k: v(parent) for k, v in metrics.items()} - combined, _ = cls.asframe(results) - return combined - - @classmethod - def visit(cls, op: PandasJoin, how, left, right, left_on, right_on): - # broadcast predicates if they are scalar values - left_on = [cls.asseries(v, like=left) for v in left_on] - right_on = [cls.asseries(v, like=right) for v in right_on] - - if how == "cross": - assert not left_on and not right_on - return cls.merge(left, right, how="cross") - elif how == "positional": - assert not left_on and not right_on - return cls.concat([left, right], axis=1) - elif how == "anti": - df = cls.merge( - left, - right, - how="outer", - left_on=left_on, - right_on=right_on, - indicator=True, - ) - df = df[df["_merge"] == "left_only"] - return df.drop(columns=["_merge"]) - elif how == "semi": - mask = cls.asseries(True, like=left) - for left_pred, right_pred in zip(left_on, right_on): - mask = mask & left_pred.isin(right_pred) - return left[mask] - else: - left_columns = {gen_name("left"): s for s in left_on} - right_columns = {gen_name("right"): s for s in right_on} - left_keys = list(left_columns.keys()) - right_keys = list(right_columns.keys()) - left = left.assign(**left_columns) - right = right.assign(**right_columns) - df = left.merge(right, how=how, left_on=left_keys, right_on=right_keys) - return df - - @classmethod - def visit( - cls, - op: PandasAsofJoin, - how, - left, - right, - left_on, - right_on, - left_by, - right_by, - operator, - ): - # broadcast predicates if they are scalar values - left_on = [cls.asseries(v, like=left) for v in left_on] - left_by = [cls.asseries(v, like=left) for v in left_by] - right_on = [cls.asseries(v, like=right) for v in right_on] - right_by = [cls.asseries(v, like=right) for v in right_by] - - # merge_asof only works with column names not with series - left_on = {gen_name("left"): s for s in left_on} - left_by = {gen_name("left"): s for s in left_by} - right_on = {gen_name("right"): s for s in right_on} - right_by = {gen_name("right"): s for s in right_by} - - left = left.assign(**left_on, **left_by) - right = right.assign(**right_on, **right_by) - - # construct the appropriate flags for merge_asof - if operator == ops.LessEqual: - direction = "forward" - allow_exact_matches = True - elif operator == ops.GreaterEqual: - direction = "backward" - allow_exact_matches = True - elif operator == ops.Less: - direction = "forward" - 
allow_exact_matches = False - elif operator == ops.Greater: - direction = "backward" - allow_exact_matches = False - elif operator == ops.Equals: - direction = "nearest" - allow_exact_matches = True - else: - raise NotImplementedError( - f"Operator {operator} not supported for asof join" - ) - - # merge_asof requires the left side to be sorted by the join keys - left = left.sort_values(by=list(left_on.keys())) - df = cls.merge_asof( - left, - right, - left_on=list(left_on.keys()), - right_on=list(right_on.keys()), - left_by=list(left_by.keys()) or None, - right_by=list(right_by.keys()) or None, - direction=direction, - allow_exact_matches=allow_exact_matches, - ) - return df - - @classmethod - def visit(cls, op: ops.Union, left, right, distinct): - result = cls.concat([left, right], axis=0) - return result.drop_duplicates() if distinct else result - - @classmethod - def visit(cls, op: ops.Intersection, left, right, distinct): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - return left.merge(right, on=list(left.columns), how="inner") - - @classmethod - def visit(cls, op: ops.Difference, left, right, distinct): - if not distinct: - raise NotImplementedError( - "`distinct=False` is not supported by the pandas backend" - ) - merged = left.merge(right, on=list(left.columns), how="outer", indicator=True) - result = merged[merged["_merge"] == "left_only"].drop("_merge", axis=1) - return result - - @classmethod - def visit(cls, op: ops.Distinct, parent): - return parent.drop_duplicates() - - @classmethod - def visit(cls, op: ops.DropNull, parent, how, subset): - if op.subset is not None: - subset = [col.name for col in op.subset] - else: - subset = None - return parent.dropna(how=how, subset=subset) - - @classmethod - def visit(cls, op: ops.FillNull, parent, replacements): - return parent.fillna(replacements) - - @classmethod - def visit(cls, op: ops.InValues, value, options): - if isinstance(value, pd.Series): - return value.isin(options) - else: - return value in options - - @classmethod - def visit(cls, op: ops.InSubquery, rel, needle): - first_column = rel.iloc[:, 0] - if isinstance(needle, pd.Series): - return needle.isin(first_column) - else: - return needle in first_column - - @classmethod - def visit(cls, op: PandasScalarSubquery, rel): - return rel.iat[0, 0] - - @classmethod - def execute(cls, node, backend, params): - def fn(node, _, **kwargs): - return cls.visit(node, **kwargs) - - original = node - node = node.to_expr().as_table().op() - node = plan(node, backend=backend, params=params) - df = node.map_clear(fn) - - # TODO(kszucs): add a flag to disable this conversion because it can be - # expensive for columns with object dtype - df = PandasData.convert_table(df, node.schema) - if isinstance(original, ops.Value): - if original.shape.is_scalar(): - return df.iloc[0, 0] - elif original.shape.is_columnar(): - return df.iloc[:, 0] - else: - raise TypeError(f"Unexpected shape: {original.shape}") - else: - return df diff --git a/ibis/backends/pandas/helpers.py b/ibis/backends/pandas/helpers.py deleted file mode 100644 index bb8550b7c8bc8..0000000000000 --- a/ibis/backends/pandas/helpers.py +++ /dev/null @@ -1,232 +0,0 @@ -from __future__ import annotations - -import itertools -import math -from typing import TYPE_CHECKING - -import numpy as np -import pandas as pd - -from ibis.util import gen_name - -if TYPE_CHECKING: - from collections.abc import Callable - - -def isnull(obj): - return obj is None or obj is pd.NA or 
(isinstance(obj, float) and math.isnan(obj)) - - -class PandasUtils: - @classmethod - def merge(cls, *args, **kwargs): - return pd.merge(*args, **kwargs) - - @classmethod - def merge_asof(cls, *args, **kwargs): - return pd.merge_asof(*args, **kwargs) - - @classmethod - def concat(cls, dfs, **kwargs): - return pd.concat(dfs, **kwargs) - - @classmethod - def asseries(cls, value, like=None): - """Ensure that value is a pandas Series object, broadcast if necessary.""" - size = len(like) if like is not None else 1 - if isinstance(value, pd.Series): - return value - elif isinstance(value, (list, np.ndarray)): - return pd.Series(itertools.repeat(np.array(value), size)) - else: - return pd.Series(np.repeat(value, size)) - - @classmethod - def asframe(cls, values: dict | tuple, concat=True): - """Construct a DataFrame from a dict or tuple of Series objects.""" - if isinstance(values, dict): - names, values = zip(*values.items()) - elif isinstance(values, tuple): - names = [f"_{i}" for i in range(len(values))] - else: - raise TypeError(f"values must be a dict, or tuple; got {type(values)}") - - all_scalars = True - representative = None - for v in values: - if isinstance(v, pd.Series): - representative = v - all_scalars = False - break - - columns = [cls.asseries(v, like=representative) for v in values] - if concat: - df = pd.concat(columns, axis=1, keys=names) - return df, all_scalars - else: - return columns, all_scalars - - @classmethod - def agg(cls, func, arg_column, where_column): - if where_column is None: - - def applier(df): - return func(df[arg_column.name]) - else: - - def applier(df): - mask = df[where_column.name] - col = df[arg_column.name][mask] - return func(col) - - return applier - - @classmethod - def generic(cls, func: Callable, operands, **kwargs): - return func(*operands.values()) - - @classmethod - def rowwise(cls, func: Callable, operands, **kwargs): - """Kernel applied to a row, where all the operands are scalars.""" - # dealing with a collection of series objects - df, _ = cls.asframe(operands) - return df.apply(func, axis=1) - - @classmethod - def columnwise(cls, func: Callable, operands, **kwargs): - """Kernel where all the operands are series objects.""" - df, _ = cls.asframe(operands) - return func(df) - - @classmethod - def serieswise(cls, func, operands, **kwargs): - """Kernel where the first operand is a series object.""" - (key, value), *rest = operands.items() - # ensure that the first operand is a series object - value = cls.asseries(value) - operands = {key: value, **dict(rest)} - return func(**operands) - - @classmethod - def elementwise(cls, func, operands, **kwargs): - """Kernel applied to an element, where all the operands are scalars.""" - value = operands.pop(next(iter(operands))) - if isinstance(value, pd.Series): - # dealing with a single series object - if operands: - return value.apply(func, **operands) - else: - return value.map(func, na_action="ignore") - else: - # dealing with a single scalar object - return func(value, **operands) - - -class UngroupedFrame: - def __init__(self, df): - self.df = df - - def groups(self): - yield self.df - - def apply_reduction(self, func, **kwargs): - result = func(self.df, **kwargs) - data = [result] * len(self.df) - return pd.Series(data, index=self.df.index) - - def apply_analytic(self, func, **kwargs): - return func(self.df, **kwargs) - - -class GroupedFrame: - def __init__(self, df, group_keys): - self.df = df - self.group_keys = group_keys - self.groupby = df.groupby(group_keys, as_index=True) - - def 
groups(self): - for _, df in self.groupby: - yield df - - def apply_analytic(self, func, **kwargs): - results = [func(df, **kwargs) for df in self.groups()] - return pd.concat(results) - - def apply_reduction(self, func, **kwargs): - name = gen_name("result") - result = self.groupby.apply(func, **kwargs).rename(name) - df = self.df.merge(result, left_on=self.group_keys, right_index=True) - return df[name] - - -class RowsFrame: - def __init__(self, parent): - self.parent = parent - - @staticmethod - def adjust(length, index, start_offset, end_offset): - if start_offset is None: - start_index = 0 - else: - start_index = index + start_offset - if start_index < 0: - start_index = 0 - elif start_index > length: - start_index = length - - if end_offset is None: - end_index = length - else: - end_index = index + end_offset + 1 - if end_index < 0: - end_index = 0 - elif end_index > length: - end_index = length - - return (start_index, end_index) - - def apply_analytic(self, func, **kwargs): - return self.parent.apply_analytic(func, **kwargs) - - def apply_reduction(self, func, **kwargs): - results = {} - for df in self.parent.groups(): - for i, (ix, row) in enumerate(df.iterrows()): - # TODO(kszucs): use unique column names for _start, _end - start, end = row["__start__"], row["__end__"] - start_index, end_index = self.adjust(len(df), i, start, end) - subdf = df.iloc[start_index:end_index] - results[ix] = func(subdf, **kwargs) - - return pd.Series(results) - - -class RangeFrame: - def __init__(self, parent, order_key): - self.parent = parent - self.order_key = order_key - - @staticmethod - def predicate(col, i, start, end): - value = col.iat[i] - if start is None: - return col <= value + end - elif end is None: - return col >= value + start - else: - return (col >= value + start) & (col <= value + end) - - def apply_analytic(self, func, **kwargs): - return self.parent.apply_analytic(func, **kwargs) - - def apply_reduction(self, func, **kwargs): - results = {} - for df in self.parent.groups(): - for i, (ix, row) in enumerate(df.iterrows()): - start, end = row["__start__"], row["__end__"] - column = df[self.order_key] - predicate = self.predicate(column, i, start, end) - subdf = df[predicate] - results[ix] = func(subdf, **kwargs) - - return pd.Series(results) diff --git a/ibis/backends/pandas/kernels.py b/ibis/backends/pandas/kernels.py deleted file mode 100644 index 0d2bc1db1de82..0000000000000 --- a/ibis/backends/pandas/kernels.py +++ /dev/null @@ -1,521 +0,0 @@ -from __future__ import annotations - -import datetime -import decimal -import json -import math -import operator - -try: - import regex as re -except ImportError: - import re -from functools import reduce -from urllib.parse import parse_qs, urlsplit - -import numpy as np -import pandas as pd -import toolz - -import ibis.expr.operations as ops -from ibis.backends.pandas.helpers import isnull - - -def substring_rowwise(row): - arg, start, length = row["arg"], row["start"], row["length"] - if isnull(arg): - return None - elif isnull(start): - return None - elif isnull(length): - return arg[start:] - else: - return arg[start : start + length] - - -def substring_serieswise(arg, start, length): - if length is None: - return arg.str[start:] - else: - return arg.str[start : start + length] - - -def _sql_like_to_regex(pattern, escape): - """Convert a SQL `LIKE` pattern to an equivalent Python regular expression. 
- - Parameters - ---------- - pattern - A LIKE pattern with the following semantics: - * `%` matches zero or more characters - * `_` matches exactly one character - * To escape `%` and `_` (or to match the `escape` parameter - itself), prefix the desired character with `escape`. - escape - Escape character - - Returns - ------- - str - A regular expression pattern equivalent to the input SQL `LIKE` pattern. - - Examples - -------- - >>> sql_like_to_regex("6%") # default is to not escape anything - '^6.*$' - >>> sql_like_to_regex("6^%", escape="^") - '^6%$' - >>> sql_like_to_regex("6_") - '^6.$' - >>> sql_like_to_regex("6/_", escape="/") - '^6_$' - >>> sql_like_to_regex("%abc") # any string ending with "abc" - '^.*abc$' - >>> sql_like_to_regex("abc%") # any string starting with "abc" - '^abc.*$' - - """ - cur_i = 0 - pattern_length = len(pattern) - - while cur_i < pattern_length: - nxt_i = cur_i + 1 - - cur = pattern[cur_i] - nxt = pattern[nxt_i] if nxt_i < pattern_length else None - - skip = 1 - - if nxt is not None and escape is not None and cur == escape: - yield nxt - skip = 2 - elif cur == "%": - yield ".*" - elif cur == "_": - yield "." - else: - yield cur - - cur_i += skip - - -def sql_like_to_regex(pattern, escape=None): - return f"^{''.join(_sql_like_to_regex(pattern, escape))}$" - - -def string_sqllike_serieswise(arg, pattern, escape): - pat = sql_like_to_regex(pattern, escape) - return arg.str.contains(pat, regex=True) - - -def string_sqlilike_serieswise(arg, pattern, escape): - pat = sql_like_to_regex(pattern, escape) - return arg.str.contains(pat, regex=True, flags=re.IGNORECASE) - - -def extract_userinfo_elementwise(x): - url_parts = urlsplit(x) - username = url_parts.username or "" - password = url_parts.password or "" - return f"{username}:{password}" - - -def extract_queryparam_rowwise(row): - query = urlsplit(row["arg"]).query - param_name = row["key"] - if param_name is not None: - value = parse_qs(query)[param_name] - return value if len(value) > 1 else value[0] - else: - return query - - -def array_index_rowwise(row): - try: - return row["arg"][row["index"]] - except IndexError: - return None - - -def array_position_rowwise(row): - try: - return row["arg"].index(row["other"]) - except ValueError: - return -1 - - -def array_remove_rowwise(row): - if row["arg"] is None: - return None - return [x for x in row["arg"] if x != row["other"]] - - -def array_slice_rowwise(row): - arg, start, stop = row["arg"], row["start"], row["stop"] - if isnull(start) and isnull(stop): - return arg - elif isnull(start): - return arg[:stop] - elif isnull(stop): - return arg[start:] - else: - return arg[start:stop] - - -def integer_range_rowwise(row): - if not row["step"]: - return [] - return list(np.arange(row["start"], row["stop"], row["step"])) - - -def timestamp_range_rowwise(row): - if not row["step"]: - return [] - return list( - pd.date_range(row["start"], row["stop"], freq=row["step"], inclusive="left") - ) - - -def _safe_method(mapping, method, *args, **kwargs): - if isnull(mapping): - return None - try: - method = getattr(mapping, method) - except AttributeError: - return None - else: - result = method(*args, **kwargs) - return None if isnull(result) else result - - -def safe_len(mapping): - return _safe_method(mapping, "__len__") - - -def safe_get(mapping, key, default=None): - return _safe_method(mapping, "get", key, default) - - -def safe_contains(mapping, key): - return _safe_method(mapping, "__contains__", key) - - -def safe_keys(mapping): - result = _safe_method(mapping, 
"keys") - if result is None: - return None - # list(...) to unpack iterable - return np.array(list(result)) - - -def safe_values(mapping): - result = _safe_method(mapping, "values") - if result is None or result is pd.NA: - return None - # list(...) to unpack iterable - return np.array(list(result), dtype="object") - - -def safe_merge(left, right): - if isnull(left) or isnull(right): - return None - else: - return {**left, **right} - - -def safe_json_getitem(value, key): - try: - # try to deserialize the value -> return None if it's None - if (js := json.loads(value)) is None: - return None - except (json.JSONDecodeError, TypeError): - # if there's an error related to decoding or a type error return None - return None - - try: - # try to extract the value as an array element or mapping key - return js[key] - except (KeyError, IndexError, TypeError): - # KeyError: missing mapping key - # IndexError: missing sequence key - # TypeError: `js` doesn't implement __getitem__, either at all or for - # the type of `key` - return None - - -def safe_decimal(func): - def wrapper(x, **kwargs): - try: - return func(x, **kwargs) - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - return wrapper - - -def round_serieswise(arg, digits): - if digits is None: - return np.round(arg).astype("int64") - else: - return np.round(arg, digits).astype("float64") - - -def arbitrary(arg): - arg = arg.dropna() - return arg.iat[0] if len(arg) else None - - -reductions = { - ops.Min: lambda x: x.min(), - ops.Max: lambda x: x.max(), - ops.Sum: lambda x: x.sum(), - ops.Mean: lambda x: x.mean(), - ops.Count: lambda x: x.count(), - ops.Mode: lambda x: x.mode().iat[0], - ops.Any: lambda x: x.any(), - ops.All: lambda x: x.all(), - ops.Median: lambda x: x.median(), - ops.ApproxMedian: lambda x: x.median(), - ops.BitAnd: lambda x: np.bitwise_and.reduce(x.values), - ops.BitOr: lambda x: np.bitwise_or.reduce(x.values), - ops.BitXor: lambda x: np.bitwise_xor.reduce(x.values), - ops.Arbitrary: arbitrary, - ops.CountDistinct: lambda x: x.nunique(), - ops.ApproxCountDistinct: lambda x: x.nunique(), -} - - -_generic = { - ops.Abs: abs, - ops.Acos: np.arccos, - ops.Add: operator.add, - ops.And: operator.and_, - ops.Asin: np.arcsin, - ops.Atan: np.arctan, - ops.Atan2: np.arctan2, - ops.BitwiseAnd: lambda x, y: np.bitwise_and(x, y), - ops.BitwiseLeftShift: lambda x, y: np.left_shift(x, y).astype("int64"), - ops.BitwiseNot: np.invert, - ops.BitwiseOr: lambda x, y: np.bitwise_or(x, y), - ops.BitwiseRightShift: lambda x, y: np.right_shift(x, y).astype("int64"), - ops.BitwiseXor: lambda x, y: np.bitwise_xor(x, y), - ops.Ceil: lambda x: np.ceil(x).astype("int64"), - ops.Cos: np.cos, - ops.Cot: lambda x: 1 / np.tan(x), - ops.DateAdd: operator.add, - ops.DateDiff: operator.sub, - ops.DateSub: operator.sub, - ops.Degrees: np.degrees, - ops.Divide: operator.truediv, - ops.Equals: operator.eq, - ops.Exp: np.exp, - ops.Floor: lambda x: np.floor(x).astype("int64"), - ops.FloorDivide: operator.floordiv, - ops.Greater: operator.gt, - ops.GreaterEqual: operator.ge, - ops.IdenticalTo: lambda x, y: (x == y) | (pd.isnull(x) & pd.isnull(y)), - ops.IntervalAdd: operator.add, - ops.IntervalFloorDivide: operator.floordiv, - ops.IntervalMultiply: operator.mul, - ops.IntervalSubtract: operator.sub, - ops.Less: operator.lt, - ops.LessEqual: operator.le, - ops.Ln: np.log, - ops.Log10: np.log10, - ops.Log2: np.log2, - ops.Modulus: operator.mod, - ops.Multiply: operator.mul, - ops.Negate: lambda x: not x if isinstance(x, (bool, np.bool_)) else -x, - 
ops.Not: lambda x: not x if isinstance(x, (bool, np.bool_)) else ~x, - ops.NotEquals: operator.ne, - ops.Or: operator.or_, - ops.Power: operator.pow, - ops.Radians: np.radians, - ops.Sign: np.sign, - ops.Sin: np.sin, - ops.Sqrt: np.sqrt, - ops.Subtract: operator.sub, - ops.Tan: np.tan, - ops.TimestampAdd: operator.add, - ops.TimestampDiff: operator.sub, - ops.TimestampSub: operator.sub, - ops.Xor: operator.xor, - ops.E: lambda: np.e, - ops.Pi: lambda: np.pi, - ops.TimestampNow: lambda: pd.Timestamp("now", tz="UTC").tz_localize(None), - ops.DateNow: lambda: pd.Timestamp(datetime.date.today()), - ops.StringConcat: lambda xs: reduce(operator.add, xs), - ops.StringJoin: lambda xs, sep: reduce(lambda x, y: x + sep + y, xs), - ops.Log: lambda x, base: np.log(x) if base is None else np.log(x) / np.log(base), -} - - -def none_safe(func): - def wrapper(*args, **kwargs): - if any(map(isnull, args)): - return None - return func(*args, **kwargs) - - return wrapper - - -generic = { - **{k: none_safe(v) for k, v in _generic.items()}, - ops.IsNull: pd.isnull, - ops.NotNull: pd.notnull, - ops.IsInf: np.isinf, -} - - -columnwise = { - ops.Clip: lambda df: df["arg"].clip(lower=df["lower"], upper=df["upper"]), - ops.IfElse: lambda df: df["true_expr"].where( - df["bool_expr"], other=df["false_null_expr"] - ), - ops.NullIf: lambda df: df["arg"].where(df["arg"] != df["null_if_expr"]), - ops.Repeat: lambda df: df["arg"] * df["times"], -} - - -rowwise = { - ops.ArrayContains: lambda row: row["other"] in row["arg"], - ops.ArrayIndex: array_index_rowwise, - ops.ArrayPosition: array_position_rowwise, - ops.ArrayRemove: array_remove_rowwise, - ops.ArrayRepeat: lambda row: np.tile(row["arg"], max(0, row["times"])), - ops.ArraySlice: array_slice_rowwise, - ops.ArrayUnion: lambda row: toolz.unique(row["left"] + row["right"]), - ops.EndsWith: lambda row: row["arg"].endswith(row["end"]), - ops.IntegerRange: integer_range_rowwise, - ops.JSONGetItem: lambda row: safe_json_getitem(row["arg"], row["index"]), - ops.Map: lambda row: dict(zip(row["keys"], row["values"])), - ops.MapGet: lambda row: safe_get(row["arg"], row["key"], row["default"]), - ops.MapContains: lambda row: safe_contains(row["arg"], row["key"]), - ops.MapMerge: lambda row: safe_merge(row["left"], row["right"]), - ops.TimestampRange: timestamp_range_rowwise, - ops.LPad: lambda row: row["arg"].rjust(row["length"], row["pad"]), - ops.RegexExtract: lambda row: re.search(row["pattern"], row["arg"]).group( - row["index"] - ), - ops.RegexReplace: lambda row: re.sub( - row["pattern"], row["replacement"], row["arg"] - ), - ops.RegexSearch: lambda row: re.search(row["pattern"], row["arg"]) is not None, - ops.RPad: lambda row: row["arg"].ljust(row["length"], row["pad"]), - ops.StartsWith: lambda row: row["arg"].startswith(row["start"]), - ops.StringContains: lambda row: row["haystack"].contains(row["needle"]), - ops.StringFind: lambda row: row["arg"].find( - row["substr"], row["start"], row["end"] - ), - ops.StringReplace: lambda row: row["arg"].replace( - row["pattern"], row["replacement"] - ), - ops.StringSplit: lambda row: row["arg"].split(row["delimiter"]), - ops.StrRight: lambda row: row["arg"][-row["nchars"] :], - ops.Translate: lambda row: row["arg"].translate( - str.maketrans(row["from_str"], row["to_str"]) - ), - ops.Substring: substring_rowwise, - ops.ExtractQuery: extract_queryparam_rowwise, - ops.Strftime: lambda row: row["arg"].strftime(row["format_str"]), -} - -serieswise = { - ops.Between: lambda arg, lower_bound, upper_bound: arg.between( - lower_bound, 
upper_bound - ), - ops.Capitalize: lambda arg: arg.str.capitalize(), - ops.Date: lambda arg: arg.dt.floor("d"), - ops.DayOfWeekIndex: lambda arg: pd.to_datetime(arg).dt.dayofweek, - ops.DayOfWeekName: lambda arg: pd.to_datetime(arg).dt.day_name(), - ops.EndsWith: lambda arg, end: arg.str.endswith(end), - ops.ExtractDay: lambda arg: arg.dt.day, - ops.ExtractDayOfYear: lambda arg: arg.dt.dayofyear, - ops.ExtractEpochSeconds: lambda arg: arg.astype("datetime64[s]") - .astype("int64") - .astype("int32"), - ops.ExtractHour: lambda arg: arg.dt.hour, - ops.ExtractMicrosecond: lambda arg: arg.dt.microsecond, - ops.ExtractMillisecond: lambda arg: arg.dt.microsecond // 1000, - ops.ExtractMinute: lambda arg: arg.dt.minute, - ops.ExtractMonth: lambda arg: arg.dt.month, - ops.ExtractQuarter: lambda arg: arg.dt.quarter, - ops.ExtractSecond: lambda arg: arg.dt.second, - ops.ExtractWeekOfYear: lambda arg: arg.dt.isocalendar().week.astype("int32"), - ops.ExtractYear: lambda arg: arg.dt.year, - ops.ExtractIsoYear: lambda arg: arg.dt.isocalendar().year, - ops.IsNull: lambda arg: arg.isnull(), - ops.NotNull: lambda arg: arg.notnull(), - ops.Lowercase: lambda arg: arg.str.lower(), - ops.LPad: lambda arg, length, pad: arg.str.rjust(length, fillchar=pad), - ops.LStrip: lambda arg: arg.str.lstrip(), - ops.Repeat: lambda arg, times: arg.str.repeat(times), - ops.Reverse: lambda arg: arg.str[::-1], - ops.Round: round_serieswise, - ops.RPad: lambda arg, length, pad: arg.str.ljust(length, fillchar=pad), - ops.RStrip: lambda arg: arg.str.rstrip(), - ops.StartsWith: lambda arg, start: arg.str.startswith(start), - ops.StringAscii: lambda arg: arg.map(ord, na_action="ignore").astype("int32"), - ops.StringContains: lambda haystack, needle: haystack.str.contains( - needle, regex=False - ), - ops.StringFind: lambda arg, substr, start, end: arg.str.find(substr, start, end), - ops.StringLength: lambda arg: arg.str.len().astype("int32"), - ops.StringReplace: lambda arg, pattern, replacement: arg.str.replace( - pattern, replacement - ), - ops.StringSplit: lambda arg, delimiter: arg.str.split(delimiter), - ops.StringSQLLike: string_sqllike_serieswise, - ops.StringSQLILike: string_sqlilike_serieswise, - ops.Strip: lambda arg: arg.str.strip(), - ops.Strftime: lambda arg, format_str: arg.dt.strftime(format_str), - ops.StrRight: lambda arg, nchars: arg.str[-nchars:], - ops.Substring: substring_serieswise, - ops.Time: lambda arg: arg.dt.time, - ops.TimestampFromUNIX: lambda arg, unit: pd.to_datetime(arg, unit=unit.short), - ops.Translate: lambda arg, from_str, to_str: arg.str.translate( - str.maketrans(from_str, to_str) - ), - ops.Uppercase: lambda arg: arg.str.upper(), -} - -elementwise = { - ops.ExtractProtocol: lambda x: getattr(urlsplit(x), "scheme", ""), - ops.ExtractAuthority: lambda x: getattr(urlsplit(x), "netloc", ""), - ops.ExtractPath: lambda x: getattr(urlsplit(x), "path", ""), - ops.ExtractFragment: lambda x: getattr(urlsplit(x), "fragment", ""), - ops.ExtractHost: lambda x: getattr(urlsplit(x), "hostname", ""), - ops.ExtractUserInfo: extract_userinfo_elementwise, - ops.StructField: lambda x, field: safe_get(x, field), - ops.ArrayLength: len, - ops.ArrayFlatten: toolz.concat, - ops.ArraySort: sorted, - ops.ArrayDistinct: toolz.unique, - ops.MapLength: safe_len, - ops.MapKeys: safe_keys, - ops.MapValues: safe_values, - ops.Round: lambda x, digits=0: round(x, digits), -} - - -elementwise_decimal = { - ops.Round: lambda x, digits=0: round(x, digits), - ops.Log10: safe_decimal(lambda x: x.log10()), - ops.Ln: 
safe_decimal(lambda x: x.ln()), - ops.Exp: safe_decimal(lambda x: x.exp()), - ops.Floor: safe_decimal(math.floor), - ops.Ceil: safe_decimal(math.ceil), - ops.Sqrt: safe_decimal(lambda x: x.sqrt()), - ops.Log2: safe_decimal(lambda x: x.ln() / decimal.Decimal(2).ln()), - ops.Sign: safe_decimal(lambda x: math.copysign(1, x)), - ops.Log: safe_decimal(lambda x, base: x.ln() / decimal.Decimal(base).ln()), -} - - -supported_operations = ( - generic.keys() - | columnwise.keys() - | rowwise.keys() - | serieswise.keys() - | elementwise.keys() -) diff --git a/ibis/backends/pandas/rewrites.py b/ibis/backends/pandas/rewrites.py deleted file mode 100644 index 87e55026dae58..0000000000000 --- a/ibis/backends/pandas/rewrites.py +++ /dev/null @@ -1,361 +0,0 @@ -from __future__ import annotations - -from typing import Optional - -from public import public - -import ibis -import ibis.expr.datashape as ds -import ibis.expr.datatypes as dt -import ibis.expr.operations as ops -from ibis.common.annotations import attribute -from ibis.common.collections import FrozenDict -from ibis.common.patterns import InstanceOf, replace -from ibis.common.typing import VarTuple # noqa: TCH001 -from ibis.expr.rewrites import lower_stringslice, p, replace_parameter -from ibis.expr.schema import Schema -from ibis.util import gen_name - - -class PandasRelation(ops.Relation): - pass - - -@public -class PandasRename(PandasRelation): - parent: ops.Relation - mapping: FrozenDict[str, str] - - @classmethod - def from_prefix(cls, parent, prefix): - mapping = {k: f"{prefix}_{k}" for k in parent.schema} - return cls(parent, mapping) - - @attribute - def values(self): - return FrozenDict( - {to: ops.Field(self.parent, from_) for from_, to in self.mapping.items()} - ) - - @attribute - def schema(self): - return Schema( - {self.mapping[name]: dtype for name, dtype in self.parent.schema.items()} - ) - - -@public -class PandasResetIndex(PandasRelation): - parent: ops.Relation - - @attribute - def values(self): - return self.parent.values - - @attribute - def schema(self): - return self.parent.schema - - -@public -class PandasJoin(PandasRelation): - left: ops.Relation - right: ops.Relation - left_on: VarTuple[ops.Value] - right_on: VarTuple[ops.Value] - how: str - - @attribute - def values(self): - return FrozenDict({**self.left.values, **self.right.values}) - - @attribute - def schema(self): - return self.left.schema | self.right.schema - - -@public -class PandasAsofJoin(PandasJoin): - left_by: VarTuple[ops.Value] - right_by: VarTuple[ops.Value] - operator: type - - -@public -class PandasAggregate(PandasRelation): - parent: ops.Relation - groups: FrozenDict[str, ops.Field] - metrics: FrozenDict[str, ops.Reduction] - - @attribute - def values(self): - return FrozenDict({**self.groups, **self.metrics}) - - @attribute - def schema(self): - return Schema({k: v.dtype for k, v in self.values.items()}) - - -@public -class PandasLimit(PandasRelation): - parent: ops.Relation - n: ops.Relation - offset: ops.Relation - - @attribute - def values(self): - return self.parent.values - - @attribute - def schema(self): - return self.parent.schema - - -@public -class PandasScalarSubquery(ops.Value): - # variant with no integrity checks - rel: ops.Relation - - shape = ds.scalar - - @attribute - def dtype(self): - return self.rel.schema.types[0] - - -@public -class PandasWindowFrame(ops.Node): - table: ops.Relation - how: str - start: Optional[ops.Value] - end: Optional[ops.Value] - group_by: VarTuple[ops.Column] - order_by: VarTuple[ops.SortKey] - - -@public 
-class PandasWindowFunction(ops.Value): - func: ops.Value - frame: PandasWindowFrame - - shape = ds.columnar - - @property - def dtype(self): - return self.func.dtype - - -def is_columnar(node): - return isinstance(node, ops.Value) and node.shape.is_columnar() - - -computable_column = p.Value(shape=ds.columnar) & ~InstanceOf( - ( - ops.Reduction, - ops.Analytic, - ops.SortKey, - ops.WindowFunction, - ops.WindowBoundary, - ) -) - - -@replace(ops.Project) -def rewrite_project(_, **kwargs): - unnests = [] - winfuncs = [] - for v in _.values.values(): - unnests.extend(v.find(ops.Unnest, filter=ops.Value)) - winfuncs.extend(v.find(ops.WindowFunction, filter=ops.Value)) - - if not winfuncs: - return PandasResetIndex(_) if unnests else _ - - selects = {ops.Field(_.parent, k): k for k in _.parent.schema} - for node in winfuncs: - # add computed values from the window function - columns = node.find(computable_column, filter=ops.Value) - for v in columns: - if v not in selects: - selects[v] = gen_name("value") - - # STEP 1: construct the pre-projection - proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) - subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} - - # STEP 2: construct new window function nodes - metrics = {} - for node in winfuncs: - subbed = node.replace(subs, filter=ops.Value) - frame = PandasWindowFrame( - table=proj, - how=subbed.how, - start=subbed.start, - end=subbed.end, - group_by=subbed.group_by, - order_by=subbed.order_by, - ) - metrics[node] = PandasWindowFunction(subbed.func, frame) - - # STEP 3: reconstruct the current projection with the window functions - subs.update(metrics) - values = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} - result = ops.Project(proj, values) - - return PandasResetIndex(result) - - -@replace(ops.Aggregate) -def rewrite_aggregate(_, **kwargs): - selects = {ops.Field(_.parent, k): k for k in _.parent.schema} - for v in _.groups.values(): - if v not in selects: - selects[v] = gen_name("group") - - reductions = {} - for v in _.metrics.values(): - for reduction in v.find(ops.Reduction, filter=ops.Value): - for arg in reduction.find(computable_column, filter=ops.Value): - if arg not in selects: - selects[arg] = gen_name("value") - if reduction not in reductions: - reductions[reduction] = gen_name("reduction") - - # STEP 1: construct the pre-projection - proj = ops.Project(_.parent, {v: k for k, v in selects.items()}) - - # STEP 2: construct the pandas aggregation - subs = {node: ops.Field(proj, name) for name, node in proj.values.items()} - groups = {name: ops.Field(proj, selects[node]) for name, node in _.groups.items()} - metrics = {name: node.replace(subs) for node, name in reductions.items()} - agg = PandasAggregate(proj, groups, metrics) - - # STEP 3: construct the post-projection - subs = {node: ops.Field(agg, name) for node, name in reductions.items()} - values = {name: ops.Field(agg, name) for name, node in _.groups.items()} - values.update({name: node.replace(subs) for name, node in _.metrics.items()}) - return ops.Project(agg, values) - - -def split_join_predicates(left, right, predicates, only_equality=True): - left_on = [] - right_on = [] - for pred in predicates: - if left not in pred.relations or right not in pred.relations: - # not a usual join predicate, so apply a trick by placing the - # predicate to the left side and adding a literal True to the right - # which the left side must be equal to - left_on.append(pred) - right_on.append(ops.Literal(True, dtype=dt.boolean)) - elif 
isinstance(pred, ops.Binary): - if only_equality and not isinstance(pred, ops.Equals): - raise TypeError("Only equality join predicates supported with pandas") - if left in pred.left.relations and right in pred.right.relations: - left_on.append(pred.left) - right_on.append(pred.right) - elif left in pred.right.relations and right in pred.left.relations: - left_on.append(pred.right) - right_on.append(pred.left) - else: - raise ValueError("Join predicate does not reference both tables") - else: - raise TypeError(f"Unsupported join predicate {pred}") - - return left_on, right_on - - -@replace(ops.JoinChain) -def rewrite_join(_, **kwargs): - # TODO(kszucs): JoinTable.index can be used as a prefix - prefixes = {} - prefixes[_.first] = prefix = str(len(prefixes)) - left = PandasRename.from_prefix(_.first, prefix) - - for link in _.rest: - prefixes[link.table] = prefix = str(len(prefixes)) - right = PandasRename.from_prefix(link.table, prefix) - - subs = {v: ops.Field(left, k) for k, v in left.values.items()} - subs.update({v: ops.Field(right, k) for k, v in right.values.items()}) - preds = [pred.replace(subs, filter=ops.Value) for pred in link.predicates] - - # separate ASOF from the rest of the joins - if link.how == "asof": - on, *by = preds - left_on, right_on = split_join_predicates( - left, right, [on], only_equality=False - ) - left_by, right_by = split_join_predicates(left, right, by) - left = PandasAsofJoin( - how="asof", - left=left, - right=right, - left_on=left_on, - right_on=right_on, - left_by=left_by, - right_by=right_by, - operator=type(on), - ) - else: - # need to replace the fields in the predicates - left_on, right_on = split_join_predicates(left, right, preds) - left = PandasJoin( - how=link.how, - left=left, - right=right, - left_on=left_on, - right_on=right_on, - ) - - subs = {v: ops.Field(left, k) for k, v in left.values.items()} - fields = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} - return ops.Project(left, fields) - - -@replace(ops.Limit) -def rewrite_limit(_, **kwargs): - if isinstance(_.n, ops.Value): - n = _.n.to_expr() - else: - n = ibis.literal(_.n) - - if isinstance(_.offset, ops.Value): - offset = _.offset.to_expr() - else: - offset = ibis.literal(_.offset) - - n = n.as_table().op() - if isinstance(n, ops.Aggregate): - n = rewrite_aggregate.match(n, context={}) - - offset = offset.as_table().op() - if isinstance(offset, ops.Aggregate): - offset = rewrite_aggregate.match(offset, context={}) - - return PandasLimit(_.parent, n, offset) - - -@replace(ops.ScalarSubquery) -def rewrite_scalar_subquery(_, **kwargs): - return PandasScalarSubquery(_.rel) - - -@replace(ops.UnboundTable) -def bind_unbound_table(_, backend, **kwargs): - return ops.DatabaseTable(name=_.name, schema=_.schema, source=backend) - - -def plan(node, backend, params): - ctx = {"params": params, "backend": backend} - node = node.replace(rewrite_scalar_subquery) - node = node.replace( - rewrite_project - | rewrite_aggregate - | rewrite_join - | rewrite_limit - | replace_parameter - | lower_stringslice - | bind_unbound_table, - context=ctx, - ) - return node diff --git a/ibis/backends/pandas/tests/__init__.py b/ibis/backends/pandas/tests/__init__.py deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/ibis/backends/pandas/tests/conftest.py b/ibis/backends/pandas/tests/conftest.py deleted file mode 100644 index 6335038fc584d..0000000000000 --- a/ibis/backends/pandas/tests/conftest.py +++ /dev/null @@ -1,314 +0,0 @@ -from __future__ import annotations - -import 
decimal -from typing import Any - -import numpy as np -import pandas as pd -import pytest - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.conftest import TEST_TABLES -from ibis.backends.pandas import Backend -from ibis.backends.tests.base import BackendTest -from ibis.backends.tests.data import array_types, json_types, struct_types, topk, win - - -class TestConf(BackendTest): - check_names = False - returned_timestamp_unit = "ns" - stateful = False - rounding_method = "half_to_even" - deps = ("pandas",) - - def _load_data(self, **_: Any) -> None: - import pandas as pd - - con = self.connection - for table_name in TEST_TABLES: - path = self.data_dir / "parquet" / f"{table_name}.parquet" - con.create_table(table_name, pd.read_parquet(path)) - con.create_table("array_types", array_types, overwrite=True) - con.create_table("struct", struct_types, overwrite=True) - con.create_table("win", win, overwrite=True) - con.create_table("json_t", json_types, overwrite=True) - con.create_table("topk", topk.to_pandas(), overwrite=True) - - @staticmethod - def connect(*, tmpdir, worker_id, **kw): - return ibis.pandas.connect(**kw) - - -@pytest.fixture(scope="module") -def df(): - return pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "plain_float64": [4.0, 5.0, 6.0], - "plain_datetimes_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ), - "plain_datetimes_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("America/New_York"), - "plain_datetimes_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).dt.tz_localize("UTC"), - "plain_uint64": pd.Series(range(1, 4), dtype=np.dtype("uint64")), - "dup_strings": list("dad"), - "dup_ints": [1, 2, 1], - "float64_as_strings": ["100.01", "234.23", "-999.34"], - "int64_as_strings": list(map(str, range(1, 4))), - "strings_with_space": [" ", "abab", "ddeeffgg"], - "translate_from_strings": ["rmz", "abc", "ghj"], - "translate_to_strings": ["lns", "ovk", "jfr"], - "int64_with_zeros": [0, 1, 0], - "float64_with_zeros": [1.0, 0.0, 1.0], - "float64_positive": [1.0, 2.0, 1.0], - "strings_with_nulls": ["a", None, "b"], - "datetime_strings_naive": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ).astype(str), - "datetime_strings_ny": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("America/New_York") - .astype(str), - "datetime_strings_utc": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=3).values - ) - .dt.tz_localize("UTC") - .astype(str), - "decimal": list(map(decimal.Decimal, ["1.0", "2", "3.234"])), - "array_of_float64": [ - np.array([1.0, 2.0], dtype="float64"), - np.array([3.0], dtype="float64"), - np.array([], dtype="float64"), - ], - "array_of_int64": [ - np.array([1, 2], dtype="int64"), - np.array([], dtype="int64"), - np.array([3], dtype="int64"), - ], - "array_of_strings": [ - np.array(["a", "b"], dtype="object"), - np.array([], dtype="object"), - np.array(["c"], dtype="object"), - ], - "map_of_strings_integers": [{"a": 1, "b": 2}, None, {}], - "map_of_integers_strings": [{}, None, {1: "a", 2: "b"}], - "map_of_complex_values": [None, {"a": [1, 2, 3], "b": []}, {}], - } - ) - - -@pytest.fixture(scope="module") -def batting_df(data_dir): - num_rows = 1000 - start_index = 30 - df = pd.read_parquet(data_dir / "parquet" / "batting.parquet").iloc[ - start_index : start_index + num_rows - ] 
- return df.reset_index(drop=True) - - -@pytest.fixture(scope="module") -def awards_players_df(data_dir): - return pd.read_parquet(data_dir / "parquet" / "awards_players.parquet") - - -@pytest.fixture(scope="module") -def df1(): - return pd.DataFrame( - {"key": list("abcd"), "value": [3, 4, 5, 6], "key2": list("eeff")} - ) - - -@pytest.fixture(scope="module") -def df2(): - return pd.DataFrame( - {"key": list("ac"), "other_value": [4.0, 6.0], "key3": list("fe")} - ) - - -@pytest.fixture(scope="module") -def intersect_df2(): - return pd.DataFrame({"key": list("cd"), "value": [5, 6], "key2": list("ff")}) - - -@pytest.fixture(scope="module") -def time_df1(): - return pd.DataFrame( - {"time": pd.to_datetime([1, 2, 3, 4]), "value": [1.1, 2.2, 3.3, 4.4]} - ) - - -@pytest.fixture(scope="module") -def time_df2(): - return pd.DataFrame({"time": pd.to_datetime([2, 4]), "other_value": [1.2, 2.0]}) - - -@pytest.fixture(scope="module") -def time_df3(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=8).values - ), - "id": list(range(1, 5)) * 2, - "value": [1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7, 8.8], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df1(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range(start="2017-01-02 01:02:03.234", periods=6).values - ), - "key": [1, 2, 3, 1, 2, 3], - "value": [1.2, 1.4, 2.0, 4.0, 8.0, 16.0], - } - ) - - -@pytest.fixture(scope="module") -def time_keyed_df2(): - return pd.DataFrame( - { - "time": pd.Series( - pd.date_range( - start="2017-01-02 01:02:03.234", freq="3D", periods=3 - ).values - ), - "key": [1, 2, 3], - "other_value": [1.1, 1.2, 2.2], - } - ) - - -@pytest.fixture(scope="module") -def client( - df, - df1, - df2, - df3, - time_df1, - time_df2, - time_df3, - time_keyed_df1, - time_keyed_df2, - intersect_df2, -): - return Backend().connect( - { - "df": df, - "df1": df1, - "df2": df2, - "df3": df3, - "left": df1, - "right": df2, - "time_df1": time_df1, - "time_df2": time_df2, - "time_df3": time_df3, - "time_keyed_df1": time_keyed_df1, - "time_keyed_df2": time_keyed_df2, - "intersect_df2": intersect_df2, - } - ) - - -@pytest.fixture(scope="module") -def df3(): - return pd.DataFrame( - { - "key": list("ac"), - "other_value": [4.0, 6.0], - "key2": list("ae"), - "key3": list("fe"), - } - ) - - -t_schema = { - "decimal": dt.Decimal(4, 3), - "array_of_float64": dt.Array(dt.double), - "array_of_int64": dt.Array(dt.int64), - "array_of_strings": dt.Array(dt.string), - "map_of_strings_integers": dt.Map(dt.string, dt.int64), - "map_of_integers_strings": dt.Map(dt.int64, dt.string), - "map_of_complex_values": dt.Map(dt.string, dt.Array(dt.int64)), -} - - -@pytest.fixture(scope="module") -def t(client): - return client.table("df", schema=t_schema) - - -@pytest.fixture(scope="module") -def lahman(batting_df, awards_players_df): - return Backend().connect( - {"batting": batting_df, "awards_players": awards_players_df} - ) - - -@pytest.fixture(scope="module") -def left(client): - return client.table("left") - - -@pytest.fixture(scope="module") -def right(client): - return client.table("right") - - -@pytest.fixture(scope="module") -def time_left(client): - return client.table("time_df1") - - -@pytest.fixture(scope="module") -def time_right(client): - return client.table("time_df2") - - -@pytest.fixture(scope="module") -def time_keyed_left(client): - return client.table("time_keyed_df1") - - -@pytest.fixture(scope="module") -def time_keyed_right(client): - return client.table("time_keyed_df2") - - 
-@pytest.fixture(scope="module") -def batting(lahman): - return lahman.table("batting") - - -@pytest.fixture(scope="module") -def sel_cols(batting): - cols = batting.columns - start, end = cols.index("AB"), cols.index("H") + 1 - return ["playerID", "yearID", "teamID", "G"] + cols[start:end] - - -@pytest.fixture(scope="module") -def players_base(batting, sel_cols): - return batting[sel_cols].order_by(sel_cols[:3]) - - -@pytest.fixture(scope="module") -def players(players_base): - return players_base.group_by("playerID") - - -@pytest.fixture(scope="module") -def players_df(players_base): - return players_base.execute().reset_index(drop=True) diff --git a/ibis/backends/pandas/tests/test_arrays.py b/ibis/backends/pandas/tests/test_arrays.py deleted file mode 100644 index 9b657eb9cf3c7..0000000000000 --- a/ibis/backends/pandas/tests/test_arrays.py +++ /dev/null @@ -1,222 +0,0 @@ -from __future__ import annotations - -import numpy as np -import numpy.testing as nt -import pandas as pd -import pytest - -import ibis -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -@pytest.mark.parametrize("arr", [[1, 3, 5], np.array([1, 3, 5])]) -@pytest.mark.parametrize("create_arr_expr", [ibis.literal, ibis.array]) -def test_array_literal(client, arr, create_arr_expr): - expr = create_arr_expr(arr) - result = client.execute(expr) - expected = np.array([1, 3, 5]) - nt.assert_array_equal(result, expected) - - -def test_array_length(t): - expr = t.select( - t.array_of_float64.length().name("array_of_float64_length"), - t.array_of_int64.length().name("array_of_int64_length"), - t.array_of_strings.length().name("array_of_strings_length"), - ) - result = expr.execute() - expected = pd.DataFrame( - { - "array_of_float64_length": [2, 1, 0], - "array_of_int64_length": [2, 0, 1], - "array_of_strings_length": [2, 0, 1], - } - ) - - tm.assert_frame_equal(result, expected) - - -def test_array_slice_using_column(t): - expr = t.array_of_int64[t.plain_int64 :] - result = expr.execute() - expected = pd.Series([[2], [], []]) - tm.assert_series_equal(result, expected) - - -def test_array_length_scalar(client): - raw_value = np.array([1, 2, 4]) - value = ibis.array(raw_value) - expr = value.length() - result = client.execute(expr) - expected = len(raw_value) - assert result == expected - - -def test_array_collect(t, df): - expr = t.float64_with_zeros.collect() - result = expr.execute() - expected = np.array(df.float64_with_zeros) - nt.assert_array_equal(result, expected) - - -def test_array_collect_grouped(t, df): - expr = t.group_by(t.dup_strings).aggregate(collected=t.float64_with_zeros.collect()) - result = expr.execute().sort_values("dup_strings").reset_index(drop=True) - expected = ( - df.groupby("dup_strings") - .float64_with_zeros.apply(np.array) - .reset_index() - .rename(columns={"float64_with_zeros": "collected"}) - ) - tm.assert_frame_equal(result, expected) - - -def test_array_collect_rolling_partitioned(t, df): - window = ibis.trailing_window(1, order_by=t.plain_int64) - colexpr = t.plain_float64.collect().over(window) - expr = t.select("dup_strings", "plain_int64", colexpr.name("collected")) - result = expr.execute() - expected = pd.DataFrame( - { - "dup_strings": ["d", "a", "d"], - "plain_int64": [1, 2, 3], - "collected": [[4.0], [4.0, 5.0], [5.0, 6.0]], - } - )[expr.columns] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - ["start", "stop"], - [ - (1, 3), - (1, 1), - (2, 3), - (2, 5), - (None, 3), - (None, None), - (3, None), - (-3, None), - (None, -3), - (-3, -1), - ], 
-) -def test_array_slice(t, df, start, stop): - expr = t.array_of_strings[start:stop] - result = expr.execute() - expected = df.array_of_strings.apply(lambda x: x[start:stop].tolist()) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ["start", "stop"], - [ - (1, 3), - (1, 1), - (2, 3), - (2, 5), - (None, 3), - (None, None), - (3, None), - (-3, None), - (None, -3), - (-3, -1), - ], -) -def test_array_slice_scalar(client, start, stop): - raw_value = np.array([-11, 42, 10]) - value = ibis.array(raw_value) - expr = value[start:stop] - result = client.execute(expr) - expected = raw_value[start:stop] - nt.assert_array_equal(result, expected) - - -@pytest.mark.parametrize("index", [1, 3, 4, 11, -11]) -def test_array_index(t, df, index): - expr = t.select(t.array_of_float64[index].name("indexed")) - result = expr.execute() - expected = pd.DataFrame( - { - "indexed": df.array_of_float64.apply( - lambda x: x[index] if -len(x) <= index < len(x) else np.nan - ) - } - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("index", [1, 3, 4, 11]) -def test_array_index_scalar(client, index): - raw_value = np.array([-10, 1, 2, 42]) - value = ibis.array(raw_value) - expr = value[index] - result = client.execute(expr) - expected = raw_value[index] if index < len(raw_value) else None - assert result == expected - - -@pytest.mark.parametrize("n", [1, 3, 4, 7, -2]) # negative returns empty list -@pytest.mark.parametrize("mul", [lambda x, n: x * n, lambda x, n: n * x]) -def test_array_repeat(t, df, n, mul): - expr = mul(t.array_of_strings, n) - result = expr.execute() - expected = df.apply( - lambda row: np.tile(row.array_of_strings, max(n, 0)).tolist(), - axis=1, - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("n", [1, 3, 4, 7, -2]) # negative returns empty list -@pytest.mark.parametrize("mul", [lambda x, n: x * n, lambda x, n: n * x]) -def test_array_repeat_scalar(client, n, mul): - raw_array = np.array([1, 2]) - array = ibis.array(raw_array) - expr = mul(array, n) - result = client.execute(expr) - if n > 0: - expected = np.tile(raw_array, n) - else: - expected = np.array([], dtype=raw_array.dtype) - nt.assert_array_equal(result, expected) - - -@pytest.mark.parametrize( - ["op", "op_raw"], - [ - (lambda x, y: x + y, lambda x, y: np.concatenate([x, y])), - (lambda x, y: y + x, lambda x, y: np.concatenate([y, x])), - ], -) -def test_array_concat(t, df, op, op_raw): - x = t.array_of_float64.cast("array") - y = t.array_of_strings - expr = op(x, y) - result = expr.execute() - expected = df.apply( - lambda row: op_raw( - np.array(list(map(str, row.array_of_float64))), # Mimic .cast() - row.array_of_strings, - ), - axis=1, - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ["op", "op_raw"], - [ - (lambda x, y: x + y, lambda x, y: np.concatenate([x, y])), - (lambda x, y: y + x, lambda x, y: np.concatenate([y, x])), - ], -) -def test_array_concat_scalar(client, op, op_raw): - raw_left = np.array([1, 2, 3]) - raw_right = np.array([3, 4]) - left = ibis.array(raw_left) - right = ibis.array(raw_right) - expr = op(left, right) - result = client.execute(expr) - expected = op_raw(raw_left, raw_right) - nt.assert_array_equal(result, expected) diff --git a/ibis/backends/pandas/tests/test_cast.py b/ibis/backends/pandas/tests/test_cast.py deleted file mode 100644 index 3f166e79464f4..0000000000000 --- a/ibis/backends/pandas/tests/test_cast.py +++ /dev/null @@ -1,189 +0,0 @@ -from __future__ import annotations - -import decimal 
- -import numpy as np -import pandas as pd -import pytest - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.conftest import is_older_than -from ibis.backends.pandas.tests.conftest import TestConf as tm - -TIMESTAMP = "2022-03-13 06:59:10.467417" - - -@pytest.mark.parametrize("from_", ["plain_float64", "plain_int64"]) -@pytest.mark.parametrize( - ("to", "expected"), - [ - ("float16", "float16"), - ("float32", "float32"), - ("float64", "float64"), - ("float", "float64"), - ("int8", "int8"), - ("int16", "int16"), - ("int32", "int32"), - ("int64", "int64"), - ("string", "object"), - ], -) -def test_cast_numeric(t, df, from_, to, expected): - c = t[from_].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize("from_", ["float64_as_strings", "int64_as_strings"]) -@pytest.mark.parametrize( - ("to", "expected"), [("double", "float64"), ("string", "object")] -) -def test_cast_string(t, df, from_, to, expected): - c = t[from_].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize("from_", ["array_of_int64", "array_of_float64"]) -@pytest.mark.parametrize( - ("to", "expected"), - [("array", dt.float64), ("array", dt.int64)], -) -def test_cast_array(t, from_, to, expected): - c = t[from_].cast(to) - result = c.execute() - - # The Series of arrays - assert result.dtype == np.object_ - - # One of the arrays in the Series - res = result[0] - assert isinstance(res, list) - - for v in result: - assert v == [dt.normalize(expected, x) for x in v] - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - "object", - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", "int64"), - ("double", "float64"), - ( - dt.Timestamp("America/Los_Angeles"), - "datetime64[ns, America/Los_Angeles]", - ), - ( - "timestamp('America/Los_Angeles')", - "datetime64[ns, America/Los_Angeles]", - ), - ], -) -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_timestamp_column(t, df, column, to, expected): - c = t[column].cast(to) - result = c.execute() - assert str(result.dtype) == expected - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - str, - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), - ( - dt.Timestamp("America/Los_Angeles"), - lambda x: x.tz_localize(tz="America/Los_Angeles"), - ), - ], -) -def test_cast_timestamp_scalar_naive(client, to, expected): - literal_expr = ibis.literal(pd.Timestamp(TIMESTAMP)) - value = literal_expr.cast(to) - result = client.execute(value) - raw = client.execute(literal_expr) - assert result == expected(raw) - - -@pytest.mark.parametrize( - ("to", "expected"), - [ - pytest.param( - "string", - str, - marks=pytest.mark.skipif( - is_older_than("pandas", "2.1.0"), reason="raises a NotImplementedError" - ), - ), - ("int64", lambda x: pd.Timestamp(x).value // int(1e9)), - ("double", lambda x: float(pd.Timestamp(x).value // int(1e9))), - ( - dt.Timestamp("America/Los_Angeles"), - lambda x: x.astimezone(tz="America/Los_Angeles"), - ), - ], -) -@pytest.mark.parametrize("tz", ["UTC", "America/New_York"]) -def test_cast_timestamp_scalar(client, to, expected, tz): - literal_expr = 
ibis.literal(pd.Timestamp(TIMESTAMP).tz_localize(tz)) - value = literal_expr.cast(to) - result = client.execute(value) - raw = client.execute(literal_expr) - assert result == expected(raw) - - -def test_timestamp_with_timezone_is_inferred_correctly(t, df): - assert t.plain_datetimes_naive.type().equals(dt.timestamp) - assert t.plain_datetimes_ny.type().equals(dt.Timestamp("America/New_York")) - assert t.plain_datetimes_utc.type().equals(dt.Timestamp("UTC")) - - -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_date(t, df, column): - expr = t[column].cast("date") - result = expr.execute() - expected = df[column].dt.normalize().dt.tz_localize(None).dt.date - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("type", [dt.Decimal(9, 2), dt.Decimal(12, 3)]) -def test_cast_to_decimal(t, df, type): - expr = t.float64_as_strings.cast(type) - result = expr.execute() - context = decimal.Context(prec=type.precision) - expected = df.float64_as_strings.apply( - lambda x: context.create_decimal(x).quantize( - decimal.Decimal( - "{}.{}".format("0" * (type.precision - type.scale), "0" * type.scale) - ) - ) - ) - tm.assert_series_equal(result, expected) - assert all( - abs(element.as_tuple().exponent) == type.scale for element in result.values - ) - assert all( - 1 <= len(element.as_tuple().digits) <= type.precision - for element in result.values - ) diff --git a/ibis/backends/pandas/tests/test_client.py b/ibis/backends/pandas/tests/test_client.py deleted file mode 100644 index e08098635f28a..0000000000000 --- a/ibis/backends/pandas/tests/test_client.py +++ /dev/null @@ -1,97 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pandas.testing as tm -import pyarrow as pa -import pytest -from pytest import param - -import ibis -import ibis.expr.operations as ops - - -@pytest.fixture -def client(): - return ibis.pandas.connect( - { - "df": pd.DataFrame({"a": [1, 2, 3], "b": list("abc")}), - "df_unknown": pd.DataFrame({"array_of_strings": [["a", "b"], [], ["c"]]}), - } - ) - - -@pytest.fixture -def table(client): - return client.table("df") - - -@pytest.fixture -def test_data(): - return pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": list("abcde")}) - - -def test_connect_no_args(): - con = ibis.pandas.connect() - assert dict(con.tables) == {} - - -def test_client_table(table): - assert isinstance(table.op(), ops.DatabaseTable) - - -@pytest.mark.parametrize( - "lamduh", - [(lambda df: df), (lambda df: pa.Table.from_pandas(df))], - ids=["dataframe", "pyarrow table"], -) -def test_create_table(client, test_data, lamduh): - test_data = lamduh(test_data) - client.create_table("testing", obj=test_data) - assert "testing" in client.list_tables() - client.create_table("testingschema", schema=client.get_schema("testing")) - assert "testingschema" in client.list_tables() - - -def test_literal(client): - lit = ibis.literal(1) - result = client.execute(lit) - assert result == 1 - - -def test_list_tables(client): - assert client.list_tables(like="df_unknown") - assert not client.list_tables(like="not_in_the_database") - assert client.list_tables() - - -def test_drop(table): - table = table.mutate(c=table.a) - expr = table.drop("a") - result = expr.execute() - expected = table[["b", "c"]].execute() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "unit", - [ - "Y", - "M", - "D", - "h", - "m", - "s", - "ms", - "us", - "ns", - param("ps", marks=pytest.mark.xfail), - 
param("fs", marks=pytest.mark.xfail), - param("as", marks=pytest.mark.xfail), - ], -) -def test_datetime64_infer(client, unit): - value = np.datetime64("2018-01-02", unit) - expr = ibis.literal(value, type="timestamp") - result = client.execute(expr) - assert result == pd.Timestamp(value).to_pydatetime() diff --git a/ibis/backends/pandas/tests/test_core.py b/ibis/backends/pandas/tests/test_core.py deleted file mode 100644 index 45e3a3a02b943..0000000000000 --- a/ibis/backends/pandas/tests/test_core.py +++ /dev/null @@ -1,69 +0,0 @@ -from __future__ import annotations - -import pandas as pd -import pandas.testing as tm -import pytest - -import ibis -import ibis.common.exceptions as com -from ibis.backends.pandas import Backend - - -@pytest.fixture -def dataframe(): - return pd.DataFrame( - { - "plain_int64": list(range(1, 4)), - "plain_strings": list("abc"), - "dup_strings": list("dad"), - } - ) - - -@pytest.fixture -def core_client(dataframe): - return Backend().connect({"df": dataframe}) - - -@pytest.fixture -def ibis_table(core_client): - return core_client.table("df") - - -def test_from_dataframe(dataframe, ibis_table, core_client): - t = Backend().from_dataframe(dataframe) - result = t.execute() - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - t = Backend().from_dataframe(dataframe, name="foo") - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - client = core_client - t = Backend().from_dataframe(dataframe, name="foo", client=client) - expected = ibis_table.execute() - tm.assert_frame_equal(result, expected) - - -def test_execute_parameter_only(): - param = ibis.param("int64") - con = ibis.pandas.connect() - result = con.execute(param, params={param.op(): 42}) - assert result == 42 - - -def test_missing_data_sources(): - t = ibis.table([("a", "string")], name="t") - expr = t.a.length() - con = ibis.pandas.connect() - with pytest.raises(com.UnboundExpressionError): - con.execute(expr) - - -def test_unbound_table_execution(): - t = ibis.table([("a", "string")], name="t") - expr = t.a.length() - con = ibis.pandas.connect({"t": pd.DataFrame({"a": ["a", "ab", "abc"]})}) - result = con.execute(expr) - assert result.tolist() == [1, 2, 3] diff --git a/ibis/backends/pandas/tests/test_functions.py b/ibis/backends/pandas/tests/test_functions.py deleted file mode 100644 index 8de16141160f5..0000000000000 --- a/ibis/backends/pandas/tests/test_functions.py +++ /dev/null @@ -1,291 +0,0 @@ -from __future__ import annotations - -import decimal -import functools -import math -import operator -from operator import methodcaller - -import numpy as np -import pandas as pd -import pytest -from pytest import param - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.backends.pandas.udf import udf - - -@pytest.mark.parametrize( - "op", - [ - # comparison - operator.eq, - operator.ne, - operator.lt, - operator.le, - operator.gt, - operator.ge, - ], -) -def test_binary_operations(t, df, op): - expr = op(t.plain_float64, t.plain_int64) - result = expr.execute() - expected = op(df.plain_float64, df.plain_int64) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("op", [operator.and_, operator.or_, operator.xor]) -def test_binary_boolean_operations(t, df, op): - expr = op(t.plain_int64 == 1, t.plain_int64 == 2) - result = expr.execute() - expected = op(df.plain_int64 == 1, df.plain_int64 == 2) - tm.assert_series_equal(result, expected) - - -def operate(func): - 
@functools.wraps(func) - def wrapper(*args, **kwargs): - try: - return func(*args, **kwargs) - except decimal.InvalidOperation: - return decimal.Decimal("NaN") - - return wrapper - - -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - param(methodcaller("round"), round, id="round"), - param( - methodcaller("round", 2), - lambda x: x.quantize(decimal.Decimal(".00")), - id="round_2", - ), - param( - methodcaller("round", 0), - lambda x: x.quantize(decimal.Decimal("0.")), - id="round_0", - ), - param(methodcaller("ceil"), lambda x: decimal.Decimal(math.ceil(x)), id="ceil"), - param( - methodcaller("floor"), lambda x: decimal.Decimal(math.floor(x)), id="floor" - ), - param( - methodcaller("sign"), - lambda x: x if not x else decimal.Decimal(1).copy_sign(x), - id="sign", - ), - param(methodcaller("sqrt"), operate(lambda x: x.sqrt()), id="sqrt"), - param( - methodcaller("log", 2), - operate(lambda x: x.ln() / decimal.Decimal(2).ln()), - id="log_2", - ), - param(methodcaller("ln"), operate(lambda x: x.ln()), id="ln"), - param( - methodcaller("log2"), - operate(lambda x: x.ln() / decimal.Decimal(2).ln()), - id="log2", - ), - param(methodcaller("log10"), operate(lambda x: x.log10()), id="log10"), - ], -) -def test_math_functions_decimal(t, df, ibis_func, pandas_func): - dtype = dt.Decimal(12, 3) - context = decimal.Context(prec=dtype.precision) - - def normalize(x): - x = context.create_decimal(x) - p = decimal.Decimal( - f"{'0' * (dtype.precision - dtype.scale)}.{'0' * dtype.scale}" - ) - return x.quantize(p) - - expr = ibis_func(t.float64_as_strings.cast(dtype)) - result = expr.execute() - - expected = ( - df.float64_as_strings.apply(normalize).apply(pandas_func).apply(normalize) - ) - tm.assert_series_equal(result, expected.astype(expr.type().to_pandas())) - - -def test_round_decimal_with_negative_places(t): - type = dt.Decimal(12, 3) - expr = t.float64_as_strings.cast(type).round(-1) - result = expr.execute() - expected = pd.Series( - list(map(decimal.Decimal, ["1.0E+2", "2.3E+2", "-1.00E+3"])), - name="float64_as_strings", - ) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - (lambda x: x.quantile(0), lambda x: x.quantile(0)), - (lambda x: x.quantile(1), lambda x: x.quantile(1)), - (lambda x: x.quantile(0.5), lambda x: x.quantile(0.5)), - ], -) -def test_quantile(t, df, ibis_func, pandas_func): - result = ibis_func(t.float64_with_zeros).execute() - expected = pandas_func(df.float64_with_zeros) - assert result == expected - - assert result == expected - - result = ibis_func(t.int64_with_zeros).execute() - expected = pandas_func(df.int64_with_zeros) - assert result == expected - - -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - ( - lambda x: x.quantile([0.25, 0.75]), - lambda x: np.array(x.quantile([0.25, 0.75])), - ) - ], -) -@pytest.mark.parametrize("column", ["float64_with_zeros", "int64_with_zeros"]) -def test_quantile_multi(t, df, ibis_func, pandas_func, column): - expr = ibis_func(t[column]) - result = expr.execute() - expected = pandas_func(df[column]) - np.testing.assert_array_equal(result, expected) - - -@pytest.mark.parametrize( - ("ibis_func", "exc"), - [ - # no lower/upper specified - (lambda x: x.clip(), ValueError), - # out of range on quantile - (lambda x: x.quantile(5.0), ValueError), - ], -) -def test_arraylike_functions_transform_errors(t, ibis_func, exc): - with pytest.raises(exc): - ibis_func(t.float64_with_zeros).execute() - - -def test_quantile_multi_array_access(client, t, df): - 
quantile = t.float64_with_zeros.quantile([0.25, 0.5]) - expr = quantile[0], quantile[1] - result = tuple(map(client.execute, expr)) - expected = tuple(df.float64_with_zeros.quantile([0.25, 0.5])) - assert result == expected - - -@pytest.mark.parametrize( - ( - "left", - "right", - "expected_value", - "expected_type", - "left_dtype", - "right_dtype", - ), - [ - (True, 1, True, bool, dt.boolean, dt.int64), - (True, 1.0, True, bool, dt.boolean, dt.float64), - (True, True, True, bool, dt.boolean, dt.boolean), - (False, 0, False, bool, dt.boolean, dt.int64), - (False, 0.0, False, bool, dt.boolean, dt.float64), - (False, False, False, bool, dt.boolean, dt.boolean), - (1, True, 1, int, dt.int64, dt.boolean), - (1, 1.0, 1, int, dt.int64, dt.float64), - (1, 1, 1, int, dt.int64, dt.int64), - (0, False, 0, int, dt.int64, dt.boolean), - (0, 0.0, 0, int, dt.int64, dt.float64), - (0, 0, 0, int, dt.int64, dt.int64), - (1.0, True, 1.0, float, dt.float64, dt.boolean), - (1.0, 1, 1.0, float, dt.float64, dt.int64), - (1.0, 1.0, 1.0, float, dt.float64, dt.float64), - (0.0, False, 0.0, float, dt.float64, dt.boolean), - (0.0, 0, 0.0, float, dt.float64, dt.int64), - (0.0, 0.0, 0.0, float, dt.float64, dt.float64), - ], -) -def test_execute_with_same_hash_value_in_scope( - left, right, expected_value, expected_type, left_dtype, right_dtype -): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise([left_dtype, right_dtype], left_dtype) - def my_func(x, _): - return x - - df = pd.DataFrame({"left": [left], "right": [right]}) - con = ibis.pandas.connect() - table = con.from_dataframe(df) - - expr = my_func(table.left, table.right) - result = con.execute(expr) - assert isinstance(result, pd.Series) - - result = result.tolist() - assert result == [expected_value] - assert type(result[0]) is expected_type - - -def test_ifelse_returning_bool(): - one = ibis.literal(1) - two = ibis.literal(2) - true = ibis.literal(True) - false = ibis.literal(False) - expr = ibis.ifelse(one + one == two, true, false) - result = ibis.pandas.connect().execute(expr) - assert result is True or result is np.True_ - - -@pytest.mark.parametrize( - ("dtype", "value"), - [ - pytest.param(dt.float64, 1, id="float_int"), - pytest.param(dt.float64, True, id="float_bool"), - pytest.param(dt.int64, 1.0, id="int_float"), - pytest.param(dt.int64, True, id="int_bool"), - pytest.param(dt.boolean, 1.0, id="bool_float"), - pytest.param(dt.boolean, 1, id="bool_int"), - ], -) -def test_signature_does_not_match_input_type(dtype, value): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise([dtype], dtype) - def func(x): - return x - - df = pd.DataFrame({"col": [value]}) - table = ibis.pandas.connect().from_dataframe(df) - - result = table.col.execute() - assert isinstance(result, pd.Series) - - result = result.tolist() - assert result == [value] - assert type(result[0]) is type(value) - - -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - ( - lambda x: x.approx_median(), - lambda x: x.median(), - ) - ], -) -@pytest.mark.parametrize("column", ["float64_with_zeros", "int64_with_zeros"]) -def test_approx_median(t, df, ibis_func, pandas_func, column): - expr = ibis_func(t[column]) - result = expr.execute() - expected = pandas_func(df[column]) - assert expected == result diff --git a/ibis/backends/pandas/tests/test_helpers.py b/ibis/backends/pandas/tests/test_helpers.py deleted file mode 100644 index 4814a0d853763..0000000000000 --- a/ibis/backends/pandas/tests/test_helpers.py +++ /dev/null @@ -1,72 +0,0 @@ -from 
__future__ import annotations - -import pytest - -from ibis.backends.pandas.helpers import RowsFrame - -lst = list(range(10)) - - -@pytest.mark.parametrize( - ("ix", "start", "end", "expected"), - [ - (0, None, None, lst), - (0, 0, None, lst), - (0, None, 0, [0]), - (0, 0, 0, [0]), - (0, 0, 1, [0, 1]), - (0, 1, 1, [1]), - (0, 1, 2, [1, 2]), - (0, 1, None, lst[1:]), - (0, None, 1, [0, 1]), - (0, -1, None, lst), - (0, None, -1, []), - (0, -1, -1, []), - (0, -2, -1, []), - (0, -2, None, lst), - (0, None, -2, []), - (0, -1, 1, [0, 1]), - (0, 1, -1, []), - (0, -1, 2, [0, 1, 2]), - (1, None, None, lst), - (1, 0, None, lst[1:]), - (1, None, 0, [0, 1]), - (1, 0, 0, [1]), - (1, 0, 1, [1, 2]), - (1, 1, 1, [2]), - (1, 1, 2, [2, 3]), - (1, 1, None, lst[2:]), - (1, None, 1, [0, 1, 2]), - (1, -1, None, lst), - (1, None, -1, [0]), - (1, -1, -1, [0]), - (1, -2, -1, [0]), - (1, -2, None, lst), - (1, None, -2, []), - (1, -1, 1, [0, 1, 2]), - (1, 1, -1, []), - (1, -1, 2, [0, 1, 2, 3]), - (2, None, None, lst), - (2, 0, None, lst[2:]), - (2, None, 0, [0, 1, 2]), - (2, 0, 0, [2]), - (2, 0, 1, [2, 3]), - (2, 1, 1, [3]), - (2, 1, 2, [3, 4]), - (2, 1, None, lst[3:]), - (2, None, 1, [0, 1, 2, 3]), - (2, -1, None, lst[1:]), - (2, None, -1, [0, 1]), - (2, -1, -1, [1]), - (2, -2, -1, [0, 1]), - (2, -2, None, lst), - (2, None, -2, [0]), - (2, -1, 1, [1, 2, 3]), - (2, 1, -1, []), - (2, -1, 2, [1, 2, 3, 4]), - (3, None, None, lst), - ], -) -def test_rows_frame_adjustment(ix, start, end, expected): - start_index, end_index = RowsFrame.adjust(len(lst), ix, start, end) - assert lst[start_index:end_index] == expected diff --git a/ibis/backends/pandas/tests/test_join.py b/ibis/backends/pandas/tests/test_join.py deleted file mode 100644 index 4d44efd1c63a2..0000000000000 --- a/ibis/backends/pandas/tests/test_join.py +++ /dev/null @@ -1,681 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd -import pandas.testing as tm -import pytest - -import ibis -from ibis.backends.conftest import is_older_than - -# SEMI and ANTI are checked in backend tests -mutating_join_type = pytest.mark.parametrize( - "how", - ["inner", "left", "right", "outer"], -) - - -@mutating_join_type -def test_join(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how).select( - left, right.other_value, right.key3 - ) - result = expr.execute() - expected = pd.merge(df1, df2, how=how, on="key") - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_cross_join(left, right, df1, df2): - expr = left.cross_join(right).select(left, right.other_value, right.key3) - result = expr.execute() - expected = pd.merge( - df1.assign(dummy=1), df2.assign(dummy=1), how="inner", on="dummy" - ).rename(columns={"key_x": "key"}) - del expected["dummy"], expected["key_y"] - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_project_left_table(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how).select(left, right.key3) - result = expr.execute() - expected = pd.merge(df1, df2, how=how, on="key")[list(left.columns) + ["key3"]] - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_cross_join_project_left_table(left, right, df1, df2): - expr = left.cross_join(right).select(left, right.key3) - result = expr.execute() - expected = pd.merge( - df1.assign(dummy=1), df2.assign(dummy=1), how="inner", on="dummy" - ).rename(columns={"key_x": "key"})[list(left.columns) + ["key3"]] - 
tm.assert_frame_equal(result[expected.columns], expected) - - -@pytest.mark.parametrize( - "how", - [ - pytest.param( - "inner", - marks=pytest.mark.xfail( - condition=is_older_than("pandas", "2.0.0"), reason="different indices" - ), - ), - "left", - "right", - "outer", - ], -) -def test_join_with_multiple_predicates(how, left, right, df1, df2): - expr = left.join( - right, [left.key == right.key, left.key2 == right.key3], how=how - ).select(left, right.key3, right.other_value) - result = expr.execute() - expected = pd.merge( - df1, - df2, - how=how, - left_on=["key", "key2"], - right_on=["key", "key3"], - suffixes=("_left", "_right"), - ).reset_index(drop=True) - - expected_columns = ["key", "value", "key2", "key3", "other_value"] - expected = expected[expected_columns] - if how == "right": - # the ibis expression references the `key` column from the left table - # which is not present in the result of the right join, but pandas - # includes the column from the right table - expected["key"] = pd.Series([np.nan, np.nan, np.nan], dtype=object) - elif how == "outer": - expected["key"] = pd.Series(["a", np.nan, "b", np.nan, "c", "d"], dtype=object) - - assert list(result.columns) == expected_columns - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "how", - [ - pytest.param( - "inner", - marks=pytest.mark.xfail( - condition=is_older_than("pandas", "2.0.0"), reason="different indices" - ), - ), - "left", - "right", - "outer", - ], -) -def test_join_with_multiple_predicates_written_as_one(how, left, right, df1, df2): - predicate = (left.key == right.key) & (left.key2 == right.key3) - expr = left.join(right, predicate, how=how).select( - left, right.key3, right.other_value - ) - result = expr.execute() - expected = pd.merge( - df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] - ).reset_index(drop=True) - - if how == "right": - expected["key"] = pd.Series([np.nan, np.nan], dtype=object) - elif how == "outer": - expected["key"] = pd.Series(["a", np.nan, "b", np.nan, "c", "d"], dtype=object) - - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_with_invalid_predicates(how, left, right): - predicate = (left.key == right.key) & (left.key2 <= right.key3) - expr = left.join(right, predicate, how=how) - with pytest.raises(TypeError): - expr.execute() - - predicate = left.key >= right.key - expr = left.join(right, predicate, how=how) - with pytest.raises(TypeError): - expr.execute() - - -@mutating_join_type -@pytest.mark.xfail(reason="Hard to detect this case") -def test_join_with_duplicate_non_key_columns(how, left, right): - left = left.mutate(x=left.value * 2) - right = right.mutate(x=right.other_value * 3) - expr = left.join(right, left.key == right.key, how=how) - - # This is undefined behavior because `x` is duplicated. 
This is difficult - # to detect - with pytest.raises(ValueError): - expr.execute() - - -@mutating_join_type -def test_join_with_duplicate_non_key_columns_not_selected(how, left, right, df1, df2): - left = left.mutate(x=left.value * 2) - right = right.mutate(x=right.other_value * 3) - right = right[["key", "other_value"]] - expr = left.join(right, left.key == right.key, how=how).select( - left, right.other_value - ) - result = expr.execute() - expected = pd.merge( - df1.assign(x=df1.value * 2), - df2[["key", "other_value"]], - how=how, - on="key", - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_with_post_expression_selection(how, left, right, df1, df2): - join = left.join(right, left.key == right.key, how=how) - expr = join.select(left.key, left.value, right.other_value) - result = expr.execute() - expected = pd.merge(df1, df2, on="key", how=how)[["key", "value", "other_value"]] - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_with_post_expression_filter(how, left): - lhs = left[["key", "key2"]] - rhs = left[["key2", "value"]] - - joined = lhs.join(rhs, "key2", how=how) - projected = joined.select(lhs, rhs.value) - expr = projected.filter(projected.value == 4) - result = expr.execute() - - df1 = lhs.execute() - df2 = rhs.execute() - expected = pd.merge(df1, df2, on="key2", how=how) - expected = expected.loc[expected.value == 4].reset_index(drop=True) - - tm.assert_frame_equal(result, expected) - - -@mutating_join_type -def test_multi_join_with_post_expression_filter(how, left, df1): - lhs = left[["key", "key2"]] - rhs = left[["key2", "value"]] - rhs2 = left[["key2", "value"]].rename(value2="value") - - joined = lhs.join(rhs, "key2", how=how) - projected = joined.select(lhs, rhs.value) - filtered = projected.filter(projected.value == 4) - - joined2 = filtered.join(rhs2, "key2") - projected2 = joined2.select(filtered.key, rhs2.value2) - expr = projected2.filter(projected2.value2 == 3) - - result = expr.execute() - - df1 = lhs.execute() - df2 = rhs.execute() - df3 = rhs2.execute() - expected = pd.merge(df1, df2, on="key2", how=how) - expected = expected.loc[expected.value == 4].reset_index(drop=True) - expected = pd.merge(expected, df3, on="key2")[["key", "value2"]] - expected = expected.loc[expected.value2 == 3].reset_index(drop=True) - - tm.assert_frame_equal(result, expected) - - -@mutating_join_type -def test_join_with_non_trivial_key(how, left, right, df1, df2): - # also test that the order of operands in the predicate doesn't matter - join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join.select(left.key, left.value, right.other_value) - result = expr.execute() - - expected = ( - pd.merge( - df1.assign(key_len=df1.key.str.len()), - df2.assign(key_len=df2.key.str.len()), - on="key_len", - how=how, - ) - .drop(["key_len", "key_y", "key2", "key3"], axis=1) - .rename(columns={"key_x": "key"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_with_non_trivial_key_project_table(how, left, right, df1, df2): - # also test that the order of operands in the predicate doesn't matter - join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join.select(left, right.other_value) - expr = expr.filter(expr.key.length() == 1) - result = expr.execute() - - expected = ( - pd.merge( - df1.assign(key_len=df1.key.str.len()), - df2.assign(key_len=df2.key.str.len()), - on="key_len", - how=how, 
- ) - .drop(["key_len", "key_y", "key2", "key3"], axis=1) - .rename(columns={"key_x": "key"}) - ) - expected = expected.loc[expected.key.str.len() == 1] - tm.assert_frame_equal(result[expected.columns], expected) - - -@mutating_join_type -def test_join_with_project_right_duplicate_column(client, how, left, df1, df3): - # also test that the order of operands in the predicate doesn't matter - right = client.table("df3") - join = left.join(right, ["key"], how=how) - expr = join.select(left.key, right.key2, right.other_value) - result = expr.execute() - - expected = ( - pd.merge(df1, df3, on="key", how=how) - .drop(["key2_x", "key3", "value"], axis=1) - .rename(columns={"key2_y": "key2"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_join_with_window_function(players_base, players_df, batting, batting_df): - players = players_base - - # this should be semi_join - tbl = batting.left_join(players, ["playerID"]) - t = tbl.select(batting.G, batting.playerID, batting.teamID) - expr = t.group_by(t.teamID).mutate( - team_avg=lambda d: d.G.mean(), - demeaned_by_player=lambda d: d.G - d.G.mean(), - ) - result = expr.execute() - - expected = pd.merge( - batting_df, players_df[["playerID"]], on="playerID", how="left" - )[["G", "playerID", "teamID"]] - team_avg = expected.groupby("teamID").G.transform("mean") - expected = expected.assign( - team_avg=team_avg, demeaned_by_player=lambda df: df.G - team_avg - ) - - tm.assert_frame_equal(result[expected.columns], expected) - - -merge_asof_minversion = pytest.mark.skipif( - pd.__version__ < "0.19.2", - reason="at least pandas-0.19.2 required for merge_asof", -) - - -@merge_asof_minversion -def test_asof_join(time_left, time_right, time_df1, time_df2): - expr = time_left.asof_join(time_right, "time") - result = expr.execute() - expected = pd.merge_asof(time_df1, time_df2, on="time") - tm.assert_frame_equal(result[expected.columns], expected) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["time"], result["time_right"]) - - -@merge_asof_minversion -def test_asof_join_predicate(time_left, time_right, time_df1, time_df2): - expr = time_left.asof_join(time_right, time_left.time == time_right.time) - result = expr.execute() - expected = pd.merge_asof( - time_df1, time_df2, on="time", direction="nearest", allow_exact_matches=True - ) - tm.assert_frame_equal(result[expected.columns], expected) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["time"], result["time_right"]) - - -@merge_asof_minversion -def test_keyed_asof_join( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - expr = time_keyed_left.asof_join(time_keyed_right, "time", predicates="key") - expr = expr.select(time_keyed_left, time_keyed_right.other_value) - result = expr.execute() - expected = pd.merge_asof(time_keyed_df1, time_keyed_df2, on="time", by="key") - tm.assert_frame_equal(result[expected.columns], expected) - - -@merge_asof_minversion -def test_keyed_asof_join_with_tolerance( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - expr = time_keyed_left.asof_join( - time_keyed_right, "time", predicates="key", tolerance=2 * ibis.interval(days=1) - ) - result = expr.execute() - expected = pd.merge_asof( - time_keyed_df1, - time_keyed_df2, - on="time", - by="key", - tolerance=pd.Timedelta("2D"), - ) - tm.assert_frame_equal(result[expected.columns], expected) - with pytest.raises(AssertionError): - tm.assert_series_equal(result["time"], result["time_right"]) - with 
pytest.raises(AssertionError): - tm.assert_series_equal(result["key"], result["key_right"]) - - -@merge_asof_minversion -def test_asof_join_overlapping_non_predicate( - time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 -): - # Add a junk column with a colliding name - time_keyed_left = time_keyed_left.mutate( - collide=time_keyed_left.key + time_keyed_left.value - ) - time_keyed_right = time_keyed_right.mutate( - collide=time_keyed_right.key + time_keyed_right.other_value - ) - time_keyed_df1.assign(collide=time_keyed_df1["key"] + time_keyed_df1["value"]) - time_keyed_df2.assign(collide=time_keyed_df2["key"] + time_keyed_df2["other_value"]) - - expr = time_keyed_left.asof_join( - time_keyed_right, on=("time", "time"), predicates=[("key", "key")] - ) - result = expr.execute() - expected = pd.merge_asof( - time_keyed_df1, time_keyed_df2, on="time", by="key", suffixes=("", "_right") - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -@pytest.mark.parametrize( - "how", - [ - "left", - "right", - "inner", - "outer", - ], -) -@pytest.mark.parametrize( - "func", - [ - pytest.param(lambda join: join["a0", "a1"], id="tuple"), - pytest.param(lambda join: join[["a0", "a1"]], id="list"), - pytest.param(lambda join: join.select(["a0", "a1"]), id="select"), - ], -) -def test_select_on_unambiguous_join(how, func): - df_t = pd.DataFrame({"a0": [1, 2, 3], "b1": list("aab")}) - df_s = pd.DataFrame({"a1": [2, 3, 4], "b2": list("abc")}) - con = ibis.pandas.connect({"t": df_t, "s": df_s}) - t = con.table("t") - s = con.table("s") - method = getattr(t, f"{how}_join") - join = method(s, t.b1 == s.b2) - expected = pd.merge(df_t, df_s, left_on=["b1"], right_on=["b2"], how=how)[ - ["a0", "a1"] - ] - assert not expected.empty - expr = func(join) - result = expr.execute() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "func", - [ - pytest.param(lambda join: join["a0", "a1"], id="tuple"), - pytest.param(lambda join: join[["a0", "a1"]], id="list"), - pytest.param(lambda join: join.select(["a0", "a1"]), id="select"), - ], -) -@merge_asof_minversion -def test_select_on_unambiguous_asof_join(func): - df_t = pd.DataFrame({"a0": [1, 2, 3], "b1": pd.date_range("20180101", periods=3)}) - df_s = pd.DataFrame({"a1": [2, 3, 4], "b2": pd.date_range("20171230", periods=3)}) - con = ibis.pandas.connect({"t": df_t, "s": df_s}) - t = con.table("t") - s = con.table("s") - join = t.asof_join(s, t.b1 == s.b2) - expected = pd.merge_asof(df_t, df_s, left_on=["b1"], right_on=["b2"])[["a0", "a1"]] - assert not expected.empty - expr = func(join) - result = expr.execute() - tm.assert_frame_equal(result, expected) - - -def test_outer_join(): - df = pd.DataFrame({"test": [1, 2, 3], "name": ["a", "b", "c"]}) - df_2 = pd.DataFrame({"test_2": [1, 5, 6], "name_2": ["d", "e", "f"]}) - - conn = ibis.pandas.connect({"df": df, "df_2": df_2}) - - ibis_table_1 = conn.table("df") - ibis_table_2 = conn.table("df_2") - - joined = ibis_table_1.outer_join( - ibis_table_2, - predicates=ibis_table_1["test"] == ibis_table_2["test_2"], - ) - result = joined.execute() - expected = pd.merge( - df, - df_2, - left_on="test", - right_on="test_2", - how="outer", - ) - tm.assert_frame_equal(result, expected) - - -def test_mutate_after_join(): - # GH3090 - df = pd.DataFrame( - { - "p_Order_Priority": ["C", "H", "L", "M"], - "p_count": [9, 9, 15, 11], - "p_density": [0.204545, 0.204545, 0.340909, 0.250000], - } - ) - df_2 = pd.DataFrame( - { - "q_Order_Priority": ["C", "H", "L", "M"], - "q_count": [13, 21, 12, 
10], - "q_density": [0.232143, 0.375000, 0.214286, 0.178571], - } - ) - - conn = ibis.pandas.connect({"df": df, "df_2": df_2}) - - ibis_table_1 = conn.table("df") - ibis_table_2 = conn.table("df_2") - - joined = ibis_table_1.outer_join( - ibis_table_2, - predicates=( - ibis_table_1["p_Order_Priority"] == ibis_table_2["q_Order_Priority"] - ), - ) - - joined = joined.mutate( - bins=( - joined["p_Order_Priority"] - .isnull() - .ifelse(joined["q_Order_Priority"], joined["p_Order_Priority"]) - ), - p_count=joined["p_count"].fill_null(0), - q_count=joined["q_count"].fill_null(0), - p_density=joined.p_density.fill_null(1e-10), - q_density=joined.q_density.fill_null(1e-10), - features=ibis.literal("Order_Priority"), - ) - - expected = pd.DataFrame( - { - "p_Order_Priority": list("CHLM"), - "p_count": [9, 9, 15, 11], - "p_density": [0.204545, 0.204545, 0.340909, 0.250000], - "q_Order_Priority": list("CHLM"), - "q_count": [13, 21, 12, 10], - "q_density": [0.232143, 0.375000, 0.214286, 0.178571], - "bins": list("CHLM"), - "features": ["Order_Priority"] * 4, - } - ) - result = joined.execute() - tm.assert_frame_equal(result, expected) - - -@pytest.fixture -def tracts_df(): - return pd.DataFrame( - [[1, 1], [2, 1], [3, 2], [4, 2], [5, 3], [6, 4]], - columns=["tract_id", "tract_farm_id"], - ) - - -@pytest.fixture -def fields_df(): - vals = [ - [1, 1, "[(0, 2), (1, 3), (2, 0), (3, 1)]"], - [2, 1, "[(2, 2), (3, 2), (3, 1)]"], - [3, 2, "[(0, 1), (-1, 0), (-2, 0), (-2, 1)]"], - [4, 3, "[(0, 1), (1, 1), (1, 2), (0, 2)]"], - [5, 3, "[(1, 0), (2, 0), (2, 3), (1, 3)]"], - [6, 3, "[(2, 0), (3, 0), (3, 2), (2, 2)]"], - [7, 4, "[(-1, -1), (0, -1), (0, -2)]"], - [8, 4, "[(1, 0), (1, -2), (0, -2), (0, -1)]"], - [ - 9, - 5, - str( - [ - (1, 0), - (2, 0), - (1, -1), - (1, -2), - (-1, -2), - (-1, -1), - (-2, 0), - (-1, 0), - (0, -1), - ] - ), - ], - [10, 6, "[(-1, 2), (0, 2), (0, 0), (-1, 0)]"], - [11, 6, "[(0, 2), (1, 2), (1, 1), (0, 1)]"], - ] - return pd.DataFrame( - vals, - columns=["field_id", "field_tract_id", "field_vertices"], - ) - - -@pytest.fixture -def harvest_df(): - vals = [ - [1, 1, 1, 1, 1, 65.80], - [2, 2, 1, 2, 2, 5750.00], - [3, 3, 1, 1, 1, 59.85], - [4, 4, 2, 2, 2, 10100.00], - [5, 5, 2, 1, 1, 90.30], - [6, 6, 2, 2, 2, 21000.00], - [7, 7, 2, 2, 2, 5150.00], - [8, 8, 2, 1, 1, 53.55], - [9, 9, 3, 1, 1, 147.00], - [10, 10, 4, 1, 1, 70.70], - [11, 11, 4, 2, 2, 9600.00], - [12, 1, 1, 2, 4, 22800.00], - [13, 2, 1, 1, 3, 19.25], - [14, 3, 1, 2, 4, 13050.00], - [15, 4, 2, 1, 3, 31.15], - [16, 5, 2, 2, 4, 33000.00], - [17, 6, 2, 1, 3, 64.40], - [18, 7, 2, 1, 3, 16.45], - [19, 8, 2, 2, 4, 15000.00], - [20, 9, 3, 2, 4, 38400.00], - [21, 10, 4, 2, 4, 19800.00], - [22, 11, 4, 1, 3, 34.30], - ] - - return pd.DataFrame( - vals, - columns=[ - "harvest_id", - "harvest_field_id", - "harvest_farmer_group_id", - "harvest_crop_id", - "harvest_date_id", - "harvest_value", - ], - ) - - -def test_multijoin(tracts_df, fields_df, harvest_df): - conn = ibis.pandas.connect( - dict( - tracts=tracts_df, - fields=fields_df, - harvest=harvest_df, - ) - ) - - tracts, fields, harvest = map(conn.table, "tracts fields harvest".split()) - - fielded = harvest.inner_join( - fields, - harvest.harvest_field_id == fields.field_id, - ) - tracted = fielded.inner_join( - tracts, - fielded.field_tract_id == tracts.tract_id, - ) - result = tracted.execute() - - fielded_df = pd.merge( - harvest_df, - fields_df, - left_on="harvest_field_id", - right_on="field_id", - ) - expected = pd.merge( - fielded_df, - tracts_df, - left_on="field_tract_id", - 
right_on="tract_id", - ) - - tm.assert_frame_equal(result, expected) - - -def test_chain_join(): - test_df1 = pd.DataFrame({"id": ["1", "1"], "value": ["a", "a"]}) - test_df2 = pd.DataFrame({"id": ["1", "1"], "value": ["z", "z"]}) - test_df3 = pd.DataFrame({"id": ["1", "1"], "value": ["z1", "z1"]}) - - conn = ibis.pandas.connect({"df1": test_df1, "df2": test_df2, "df3": test_df3}) - - t1 = conn.table("df1") - t2 = conn.table("df2") - t3 = conn.table("df3") - - expr = ( - t1.join(t2, t1.id == t2.id) - .join(t3, t1.id == t3.id) - .select(t1.id, t1.value, t2.value.name("value2"), t3.value.name("value3")) - ) - result = expr.execute() - - n = len(test_df1) * len(test_df2) * len(test_df3) - expected = pd.DataFrame( - { - "id": ["1"] * n, - "value": ["a"] * n, - "value2": ["z"] * n, - "value3": ["z1"] * n, - } - ) - tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/pandas/tests/test_maps.py b/ibis/backends/pandas/tests/test_maps.py deleted file mode 100644 index c672dc743b40b..0000000000000 --- a/ibis/backends/pandas/tests/test_maps.py +++ /dev/null @@ -1,93 +0,0 @@ -from __future__ import annotations - -import numpy as np -import pandas as pd - -import ibis -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -def test_map_length_expr(t): - expr = t.map_of_integers_strings.length() - result = expr.execute() - expected = pd.Series([0, None, 2], name="map_of_integers_strings") - tm.assert_series_equal(result, expected) - - -def test_map_value_for_key_expr(t): - expr = t.map_of_integers_strings[1] - result = expr.execute() - expected = pd.Series([None, None, "a"], name="map_of_integers_strings") - tm.assert_series_equal(result, expected) - - -def test_map_value_or_default_for_key_expr(t): - expr = t.map_of_complex_values.get("a") - result = expr.execute() - expected = pd.Series( - [None, [1, 2, 3], None], dtype="object", name="map_of_complex_values" - ) - tm.assert_series_equal(result, expected) - - -def safe_sorter(element): - return np.sort(element) if isinstance(element, np.ndarray) else element - - -def test_map_keys_expr(t): - expr = t.map_of_strings_integers.keys() - result = expr.execute().map(safe_sorter) - expected = pd.Series( - np.array([["a", "b"], None, []], dtype="object"), - dtype="object", - name="map_of_strings_integers", - ) - tm.assert_series_equal(result, expected) - - -def test_map_keys_scalar(client, t): - expr = ibis.literal({"a": 10, "b": 50, "c": 20, "d": 40}) - expr = expr.keys() - result = client.execute(expr) - expected = np.array(["a", "b", "c", "d"]) - np.testing.assert_array_equal(result, expected) - - -def test_map_values_expr(t): - expr = t.map_of_complex_values.values() - result = expr.execute().map(safe_sorter) - expected = pd.Series( - [None, [[1, 2, 3], []], []], dtype="object", name="map_of_complex_values" - ) - tm.assert_series_equal(result, expected) - - -def test_map_values_scalar(client, t): - expr = ibis.literal({"a": 10, "b": 50, "c": 20, "d": 40}) - expr = expr.values() - result = client.execute(expr) - expected = np.array([10, 50, 20, 40]) - np.testing.assert_array_equal(result, expected) - - -def test_map_concat_expr(t): - expr = t.map_of_complex_values + {"b": [4, 5, 6], "c": [], "a": []} - result = expr.execute() - expected = pd.Series( - [ - None, - {"a": [], "b": [4, 5, 6], "c": []}, - {"b": [4, 5, 6], "c": [], "a": []}, - ], - dtype="object", - name="map_of_complex_values", - ) - tm.assert_series_equal(result, expected) - - -def test_map_value_for_key_literal_broadcast(t): - lookup_table = ibis.literal({"a": 1, 
"b": 2, "c": 3, "d": 4}) - expr = lookup_table.get(t.dup_strings) - result = expr.execute() - expected = pd.Series([4, 1, 4], name="dup_strings") - tm.assert_series_equal(result, expected.astype(expr.type().to_pandas())) diff --git a/ibis/backends/pandas/tests/test_operations.py b/ibis/backends/pandas/tests/test_operations.py deleted file mode 100644 index 6e56472a92642..0000000000000 --- a/ibis/backends/pandas/tests/test_operations.py +++ /dev/null @@ -1,828 +0,0 @@ -from __future__ import annotations - -import operator -from operator import methodcaller - -import numpy as np -import numpy.testing as npt -import pandas as pd -import pytest -from pytest import param - -import ibis -import ibis.expr.datatypes as dt -from ibis import _ -from ibis.backends.pandas import Backend -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -def test_table_column(t, df): - expr = t.plain_int64 - result = expr.execute() - expected = df.plain_int64 - tm.assert_series_equal(result, expected) - - -def test_literal(client): - assert client.execute(ibis.literal(1)) == 1 - - -def test_selection(t, df): - expr = t.filter( - ((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d") - ) - result = expr.execute() - expected = df[ - ((df.plain_strings == "a") | (df.plain_int64 == 3)) & (df.dup_strings == "d") - ].reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_mutate(t, df): - expr = t.mutate(x=t.plain_int64 + 1, y=t.plain_int64 * 2) - result = expr.execute() - expected = df.assign(x=df.plain_int64 + 1, y=df.plain_int64 * 2) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_project_scope_does_not_override(t, df): - col = t.plain_int64 - expr = t.select( - col.name("new_col"), - col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), - ) - result = expr.execute() - expected = pd.concat( - [ - df[["plain_int64", "dup_strings"]].rename( - columns={"plain_int64": "new_col"} - ), - df.groupby("dup_strings") - .plain_int64.transform("sum") - .reset_index(drop=True) - .rename("grouped"), - ], - axis=1, - )[["new_col", "grouped"]] - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "where", - [ - param(lambda _: None, id="none"), - param(lambda t: t.dup_strings == "d", id="simple"), - param(lambda t: (t.dup_strings == "d") | (t.plain_int64 < 100), id="complex"), - ], -) -@pytest.mark.parametrize( - ("ibis_func", "pandas_func"), - [ - param(methodcaller("abs"), np.abs, id="abs"), - param(methodcaller("ceil"), np.ceil, id="ceil"), - param(methodcaller("exp"), np.exp, id="exp"), - param(methodcaller("floor"), np.floor, id="floor"), - param(methodcaller("ln"), np.log, id="log"), - param(methodcaller("log10"), np.log10, id="log10"), - param(methodcaller("log", 2), lambda x: np.log(x) / np.log(2), id="logb"), - param(methodcaller("log2"), np.log2, id="log2"), - param( - methodcaller("round", 0), lambda x: x.round(0).astype("int64"), id="round0" - ), - param(methodcaller("round", -2), methodcaller("round", -2), id="roundm2"), - param(methodcaller("round", 2), methodcaller("round", 2), id="round2"), - param(methodcaller("round"), lambda x: x.round().astype("int64"), id="round"), - param(methodcaller("sign"), np.sign, id="sign"), - param(methodcaller("sqrt"), np.sqrt, id="sqrt"), - ], -) -def test_aggregation_group_by(t, df, where, ibis_func, pandas_func): - ibis_where = where(t) - expr = t.group_by(t.dup_strings).aggregate( - avg_plain_int64=t.plain_int64.mean(where=ibis_where), - 
sum_plain_float64=t.plain_float64.sum(where=ibis_where), - mean_float64_positive=ibis_func(t.float64_positive).mean(where=ibis_where), - neg_mean_int64_with_zeros=(-t.int64_with_zeros).mean(where=ibis_where), - nunique_dup_ints=t.dup_ints.nunique(), - ) - result = expr.execute() - - pandas_where = where(df) - mask = slice(None) if pandas_where is None else pandas_where - expected = ( - df.groupby("dup_strings") - .agg( - { - "plain_int64": lambda x, mask=mask: x[mask].mean(), - "plain_float64": lambda x, mask=mask: x[mask].sum(), - "dup_ints": "nunique", - "float64_positive": ( - lambda x, mask=mask, func=pandas_func: func(x[mask]).mean() - ), - "int64_with_zeros": lambda x, mask=mask: (-x[mask]).mean(), - } - ) - .reset_index() - .rename( - columns={ - "plain_int64": "avg_plain_int64", - "plain_float64": "sum_plain_float64", - "dup_ints": "nunique_dup_ints", - "float64_positive": "mean_float64_positive", - "int64_with_zeros": "neg_mean_int64_with_zeros", - } - ) - ) - lhs = result[expected.columns] - rhs = expected - tm.assert_frame_equal(lhs, rhs) - - -def test_aggregation_without_group_by(t, df): - expr = t.aggregate( - avg_plain_int64=t.plain_int64.mean(), - sum_plain_float64=t.plain_float64.sum(), - ) - result = expr.execute()[["avg_plain_int64", "sum_plain_float64"]] - new_names = { - "plain_float64": "sum_plain_float64", - "plain_int64": "avg_plain_int64", - } - expected = ( - pd.Series( - [df["plain_int64"].mean(), df["plain_float64"].sum()], - index=["plain_int64", "plain_float64"], - ) - .to_frame() - .T.rename(columns=new_names) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_group_by_with_having(t, df): - expr = ( - t.group_by(t.dup_strings) - .having(t.plain_float64.sum() == 5) - .aggregate(avg_a=t.plain_int64.mean(), sum_c=t.plain_float64.sum()) - ) - result = expr.execute() - - expected = ( - df.groupby("dup_strings") - .agg({"plain_int64": "mean", "plain_float64": "sum"}) - .reset_index() - .rename(columns={"plain_int64": "avg_a", "plain_float64": "sum_c"}) - ) - expected = expected.loc[expected.sum_c == 5, ["avg_a", "sum_c"]] - - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_group_by_rename_key(t, df): - expr = t.group_by(t.dup_strings.name("foo")).aggregate( - dup_string_count=t.dup_strings.count() - ) - assert "foo" in expr.schema() - result = expr.execute() - assert "foo" in result.columns - - expected = ( - df.groupby("dup_strings") - .dup_strings.count() - .rename("dup_string_count") - .reset_index() - .rename(columns={"dup_strings": "foo"}) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("reduction", ["mean", "sum", "count", "std", "var"]) -@pytest.mark.parametrize( - "where", - [ - lambda t: (t.plain_strings == "a") | (t.plain_strings == "c"), - lambda t: (t.dup_strings == "d") - & ((t.plain_int64 == 1) | (t.plain_int64 == 3)), - lambda t: None, - ], -) -def test_reduction(t, df, reduction, where): - func = getattr(t.plain_int64, reduction) - mask = where(t) - expr = func(where=mask) - result = expr.execute() - - df_mask = where(df) - expected_func = getattr( - df.loc[df_mask if df_mask is not None else slice(None), "plain_int64"], - reduction, - ) - expected = expected_func() - assert result == expected - - -@pytest.mark.parametrize( - "reduction", - [ - lambda x: x.any(), - lambda x: x.all(), - lambda x: ~(x.any()), - lambda x: ~(x.all()), - ], -) -def test_boolean_aggregation(t, df, reduction): - expr = reduction(t.plain_int64 == 1) - result = expr.execute() - expected = 
reduction(df.plain_int64 == 1) - assert result == expected - - -@pytest.mark.parametrize("column", ["float64_with_zeros", "int64_with_zeros"]) -def test_nullif_zero(t, df, column): - expr = t[column].nullif(0) - result = expr.execute() - expected = df[column].replace(0, np.nan) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("left", "right", "expected", "compare"), - [ - param( - lambda t: ibis.literal(1), - lambda t: ibis.literal(1), - lambda df: np.nan, - np.testing.assert_array_equal, # treats NaNs as equal - id="literal_literal_equal", - ), - param( - lambda t: ibis.literal(1), - lambda t: ibis.literal(2), - lambda df: 1, - np.testing.assert_equal, - id="literal_literal_not_equal", - ), - param( - lambda t: t.dup_strings, - lambda t: ibis.literal("a"), - lambda df: df.dup_strings.where(df.dup_strings != "a"), - tm.assert_series_equal, - id="series_literal", - ), - param( - lambda t: t.dup_strings, - lambda t: t.dup_strings, - lambda df: df.dup_strings.where(df.dup_strings != df.dup_strings), - tm.assert_series_equal, - id="series_series", - ), - param( - lambda t: ibis.literal("a"), - lambda t: t.dup_strings, - lambda _: pd.Series(["a", np.nan, "a"], name="dup_strings"), - tm.assert_series_equal, - id="literal_series", - ), - ], -) -def test_nullif(t, df, left, right, expected, compare): - expr = left(t).nullif(right(t)) - result = Backend().execute(expr) - compare(result, expected(df)) - - -def test_nullif_inf(): - df = pd.DataFrame({"a": [np.inf, 3.14, -np.inf, 42.0]}) - con = Backend().connect({"t": df}) - t = con.table("t") - expr = t.a.nullif(np.inf).nullif(-np.inf) - result = expr.execute() - expected = pd.Series([np.nan, 3.14, np.nan, 42.0], name="a") - tm.assert_series_equal(result, expected) - - -def test_group_concat(t, df): - expr = t.group_by(t.dup_strings).aggregate(foo=t.plain_int64.group_concat(",")) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .apply(lambda df: ",".join(df.plain_int64.astype(str))) - .reset_index() - .rename(columns={0: "foo"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -@pytest.mark.parametrize("offset", [0, 2]) -def test_frame_limit(t, df, offset): - n = 5 - df_expr = t.limit(n, offset=offset) - result = df_expr.execute() - expected = df.iloc[offset : offset + n].reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) - - -@pytest.mark.parametrize("offset", [0, 2]) -def test_series_limit(t, df, offset): - with pytest.raises(AttributeError): - t.plain_int64.limit(5, offset=offset) - - -@pytest.mark.parametrize( - ("key", "pandas_by", "pandas_ascending"), - [ - (lambda t, col: [ibis.desc(t[col])], lambda col: [col], False), - ( - lambda t, col: [t[col], ibis.desc(t.plain_int64)], - lambda col: [col, "plain_int64"], - [True, False], - ), - ( - lambda t, col: [ibis.desc(t.plain_int64 * 2)], - lambda col: ["plain_int64"], - False, - ), - ], -) -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_order_by(t, df, column, key, pandas_by, pandas_ascending): - expr = t.order_by(key(t, column)) - result = expr.execute() - expected = df.sort_values( - pandas_by(column), ascending=pandas_ascending - ).reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_complex_order_by(t, df): - expr = t.order_by([ibis.desc(t.plain_int64 * t.plain_float64), t.plain_float64]) - result = expr.execute() - expected = ( - df.assign(foo=df.plain_int64 * 
df.plain_float64) - .sort_values(["foo", "plain_float64"], ascending=[False, True]) - .drop(["foo"], axis=1) - .reset_index(drop=True) - ) - - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_count_distinct(t, df): - expr = t.dup_strings.nunique() - result = expr.execute() - expected = df.dup_strings.nunique() - assert result == expected - - -def test_value_counts(t, df): - expr = t.dup_strings.value_counts() - result = expr.execute() - expected = ( - df.dup_strings.value_counts() - .rename("dup_strings") - .reset_index(name="dup_strings_count") - .rename(columns={"index": "dup_strings"}) - .sort_values(["dup_strings"]) - .reset_index(drop=True) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_table_count(t, df): - expr = t.count() - result = expr.execute() - expected = len(df) - assert result == expected - - -def test_weighted_average(t, df): - expr = t.group_by(t.dup_strings).aggregate( - avg=(t.plain_float64 * t.plain_int64).sum() / t.plain_int64.sum() - ) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .apply( - lambda df: (df.plain_int64 * df.plain_float64).sum() / df.plain_int64.sum() - ) - .reset_index() - .rename(columns={0: "avg"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_group_by_multiple_keys(t, df): - expr = t.group_by([t.dup_strings, t.dup_ints]).aggregate( - avg_plain_float64=t.plain_float64.mean() - ) - result = expr.execute() - expected = ( - df.groupby(["dup_strings", "dup_ints"]) - .agg({"plain_float64": "mean"}) - .reset_index() - .rename(columns={"plain_float64": "avg_plain_float64"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_mutate_after_group_by(t, df): - gb = t.group_by(t.dup_strings).aggregate(avg_plain_float64=t.plain_float64.mean()) - expr = gb.mutate(x=gb.avg_plain_float64) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .agg({"plain_float64": "mean"}) - .reset_index() - .rename(columns={"plain_float64": "avg_plain_float64"}) - ) - expected = expected.assign(x=expected.avg_plain_float64) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_groupby_with_unnamed_arithmetic(t, df): - expr = t.group_by(t.dup_strings).aggregate( - naive_variance=((t.plain_float64**2).sum() - t.plain_float64.mean() ** 2) - / t.plain_float64.count() - ) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .agg({"plain_float64": lambda x: ((x**2).sum() - x.mean() ** 2) / x.count()}) - .reset_index() - .rename(columns={"plain_float64": "naive_variance"}) - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_isnull(t, df): - expr = t.strings_with_nulls.isnull() - result = expr.execute() - expected = df.strings_with_nulls.isnull() - tm.assert_series_equal(result, expected) - - -def test_notnull(t, df): - expr = t.strings_with_nulls.notnull() - result = expr.execute() - expected = df.strings_with_nulls.notnull() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("raw_value", [0.0, 1.0]) -def test_scalar_parameter(t, df, raw_value): - value = ibis.param(dt.double) - expr = t.float64_with_zeros == value - result = expr.execute(params={value: raw_value}) - expected = df.float64_with_zeros == raw_value - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("elements", [[1], (1,), {1}, frozenset({1})]) -def test_isin(t, df, elements): - expr = t.plain_float64.isin(elements) - expected = df.plain_float64.isin(elements) - result 
= expr.execute() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("elements", [[1], (1,), {1}, frozenset({1})]) -def test_notin(t, df, elements): - expr = t.plain_float64.notin(elements) - expected = ~df.plain_float64.isin(elements) - result = expr.execute() - tm.assert_series_equal(result, expected) - - -def test_cast_on_group_by(t, df): - expr = t.group_by(t.dup_strings).aggregate( - casted=(t.float64_with_zeros == 0).cast("int64").sum() - ) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .float64_with_zeros.apply(lambda s: (s == 0).astype("int64").sum()) - .reset_index() - .rename(columns={"float64_with_zeros": "casted"}) - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "opname", ["add", "mul", "sub", "truediv", "floordiv", "mod", "pow"] -) -@pytest.mark.parametrize( - "argfunc", [param(lambda c: (1.0, c), id="1c"), param(lambda c: (c, 1.0), id="c1")] -) -def test_left_binary_op(t, df, opname, argfunc): - op = getattr(operator, opname) - left, right = argfunc(t.float64_with_zeros) - expr = op(left, right) - result = expr.execute() - expected = op(*argfunc(df.float64_with_zeros)) - tm.assert_series_equal( - result, - expected, - check_dtype=not (isinstance(right, float) and opname == "floordiv"), - ) - - -@pytest.mark.parametrize( - "opname", ["add", "mul", "sub", "truediv", "floordiv", "mod", "pow"] -) -@pytest.mark.parametrize( - "argfunc", [param(lambda c: (1.0, c), id="1c"), param(lambda c: (c, 1.0), id="c1")] -) -def test_left_binary_op_gb(t, df, opname, argfunc): - op = getattr(operator, opname) - left, right = argfunc(t.float64_with_zeros) - expr = t.group_by("dup_strings").aggregate(foo=op(left, right).sum()) - result = expr.execute() - expected = ( - df.groupby("dup_strings") - .float64_with_zeros.apply(lambda s: op(*argfunc(s)).sum()) - .reset_index() - .rename(columns={"float64_with_zeros": "foo"}) - ) - tm.assert_frame_equal( - result, - expected, - check_dtype=not (isinstance(right, float) and opname == "floordiv"), - ) - - -@pytest.mark.parametrize( - "left_f", - [ - param(lambda e: e - 1, id="sub"), - param(lambda _: 0.0, id="zero"), - param(lambda _: None, id="none"), - ], -) -@pytest.mark.parametrize( - "right_f", - [ - param(lambda e: e + 1, id="add"), - param(lambda _: 1.0, id="one"), - param(lambda _: None, id="none"), - ], -) -def test_ifelse_series(t, df, left_f, right_f): - col_expr = t["plain_int64"] - result = ibis.ifelse( - col_expr > col_expr.mean(), left_f(col_expr), right_f(col_expr) - ).execute() - - series = df["plain_int64"] - cond = series > series.mean() - left = left_f(series) - if not isinstance(left, pd.Series): - left = pd.Series(np.repeat(left, len(cond)), name=cond.name) - expected = left.where(cond, right_f(series)) - - tm.assert_series_equal( - result.astype(object).fillna(pd.NA), - expected.astype(object).fillna(pd.NA), - check_dtype=False, - ) - - -@pytest.mark.parametrize( - ("cond", "expected_func"), - [ - param(True, lambda df: df["plain_int64"].astype("float64"), id="true"), - param(False, lambda df: pd.Series(np.repeat(3.0, len(df))), id="false"), - ], -) -def test_ifelse_scalar(t, df, cond, expected_func): - expr = ibis.ifelse(cond, t["plain_int64"], 3.0) - result = expr.execute() - expected = expected_func(df) - tm.assert_series_equal(result, expected) - - -def test_ifelse_long(batting, batting_df): - col_expr = batting["AB"] - result = ibis.ifelse(col_expr > col_expr.mean(), col_expr, 0.0).execute() - - series = batting_df["AB"] - expected = series.where(series 
> series.mean(), other=0.0).astype("float64") - - tm.assert_series_equal(result, expected) - - -def test_round(t, df): - precision = 2 - mult = 3.33333 - result = (t.count() * mult).round(precision).execute() - expected = np.around(len(df) * mult, precision) - npt.assert_almost_equal(result, expected, decimal=precision) - - -def test_quantile_groupby(batting, batting_df): - def q_fun(x, quantile): - res = x.quantile(quantile).tolist() - return [res for _ in range(len(x))] - - frac = 0.2 - result = ( - batting.group_by("teamID") - .mutate(res=lambda x: x.RBI.quantile([frac, 1 - frac])) - .res.execute() - ) - expected = ( - batting_df.groupby("teamID") - .RBI.transform(q_fun, quantile=[frac, 1 - frac]) - .rename("res") - ) - tm.assert_series_equal(result, expected) - - -def test_summary_numeric(batting, batting_df): - expr = batting.aggregate( - count=_.G.count(), - nulls=_.G.isnull().sum(), - min=_.G.min(), - max=_.G.max(), - sum=_.G.sum(), - mean=_.G.mean(), - approx_nunique=_.G.nunique(), - ) - result = expr.execute() - assert len(result) == 1 - - G = batting_df.G - expected = { - "count": G.count(), - "nulls": G.isnull().sum(), - "min": G.min(), - "max": G.max(), - "sum": G.sum(), - "mean": G.mean(), - "approx_nunique": G.nunique(), - } - assert dict(result.iloc[0]) == expected - - -def test_summary_non_numeric(batting, batting_df): - expr = batting.aggregate( - count=_.teamID.count(), - nulls=_.teamID.isnull().sum(), - uniques=_.teamID.nunique(), - ) - result = expr.execute() - assert len(result) == 1 - assert len(result.columns) == 3 - expected = { - "count": batting_df.teamID.count(), - "nulls": batting_df.teamID.isnull().sum(), - "uniques": batting_df.teamID.nunique(), - } - assert dict(result.iloc[0]) == expected - - -def test_non_range_index(): - def do_replace(col): - return col.cases( - ( - (1, "one"), - (2, "two"), - ), - default="unk", - ) - - df = pd.DataFrame( - { - "A": pd.Series({i: i % 3 for i in (0, 1, 2, 4)}), - "B": 0, - } - ) - expr = ( - ibis.pandas.connect({"t": df}) - .table("t") - .mutate(A=lambda t: t["A"].pipe(do_replace)) - ) - assert df.index.equals(expr.execute().index) - - -def test_table_distinct(t, df): - expr = t[["dup_strings"]].distinct() - result = expr.execute() - expected = df[["dup_strings"]].drop_duplicates() - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("distinct", [True, False]) -def test_union(client, df1, distinct): - t = client.table("df1") - expr = t.union(t, distinct=distinct) - result = expr.execute() - expected = df1 if distinct else pd.concat([df1, df1], axis=0, ignore_index=True) - tm.assert_frame_equal(result, expected) - - -def test_intersect(client, df1, intersect_df2): - t1 = client.table("df1") - t2 = client.table("intersect_df2") - expr = t1.intersect(t2) - result = expr.execute() - expected = df1.merge(intersect_df2, on=list(df1.columns)) - tm.assert_frame_equal(result, expected) - - -def test_difference(client, df1, intersect_df2): - t1 = client.table("df1") - t2 = client.table("intersect_df2") - expr = t1.difference(t2) - result = expr.execute() - merged = df1.merge(intersect_df2, on=list(df1.columns), how="outer", indicator=True) - expected = merged[merged["_merge"] != "both"].drop("_merge", axis=1) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "distinct", - [ - param( - True, - marks=pytest.mark.xfail( - raises=TypeError, - reason="Pandas cannot compute the distinct element of an array column", - ), - ), - False, - ], -) -def test_union_with_list_types(t, df, distinct): - 
expr = t.union(t, distinct=distinct) - result = expr.execute() - expected = df if distinct else pd.concat([df, df], axis=0, ignore_index=True) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "operation", - [ - pytest.param( - lambda column: column + column, - id="sum", - ), - pytest.param( - lambda column: column + 1, - id="sum_scalar", - ), - pytest.param( - lambda column: column - column, - id="subtract", - ), - pytest.param( - lambda column: column - 1, - id="subtract_scalar", - ), - pytest.param( - lambda column: column * column, - id="multiply", - ), - pytest.param( - lambda column: column * 2, - id="multiply_scalar", - ), - pytest.param( - lambda column: column % column, - id="mod", - marks=pytest.mark.xfail( - raises=ZeroDivisionError, - reason=("Ibis cannot modulo divide two unsigned integer columns."), - ), - ), - pytest.param( - lambda column: column % 2, - id="mod_scalar", - ), - pytest.param( - lambda column: column / column, - id="divide", - ), - pytest.param( - lambda column: column / 2, - id="divide_scalar", - ), - pytest.param( - lambda column: column // column, - id="floordivide", - ), - pytest.param( - lambda column: column // 2, - id="floordivide_scalar", - ), - ], -) -def test_unsigned_integers(t, df, operation): - expr = operation(t.plain_uint64) - result = expr.execute() - expected = operation(df.plain_uint64) - expected_dtype = expr.type().to_pandas() - assert result.dtype == expected_dtype - tm.assert_series_equal(result, expected.astype(expected_dtype)) diff --git a/ibis/backends/pandas/tests/test_strings.py b/ibis/backends/pandas/tests/test_strings.py deleted file mode 100644 index 3aa9b4bc5a006..0000000000000 --- a/ibis/backends/pandas/tests/test_strings.py +++ /dev/null @@ -1,184 +0,0 @@ -from __future__ import annotations - -from warnings import catch_warnings - -import numpy as np -import pandas.testing as tm -import pytest -from pytest import param - -import ibis -from ibis.backends.pandas import Backend -from ibis.backends.pandas.kernels import sql_like_to_regex - - -@pytest.mark.parametrize( - ("case_func", "expected_func"), - [ - param( - lambda s: s.length(), - lambda s: s.str.len().astype("int32"), - id="length", - ), - param(lambda s: s.substr(1, 2), lambda s: s.str[1:3], id="substr"), - param(lambda s: s[1:3], lambda s: s.str[1:3], id="slice"), - param( - lambda s: s[s.length() - 1 :], - lambda s: s.str[-1:], - id="expr_slice_begin", - ), - param(lambda s: s[: s.length()], lambda s: s, id="expr_slice_end"), - param( - lambda s: s[s.length() - 2 : s.length() - 1], - lambda s: s.str[-2:-1], - id="expr_slice_begin_end", - ), - param(lambda s: s.strip(), lambda s: s.str.strip(), id="strip"), - param(lambda s: s.lstrip(), lambda s: s.str.lstrip(), id="lstrip"), - param(lambda s: s.rstrip(), lambda s: s.str.rstrip(), id="rstrip"), - param( - lambda s: s.lpad(3, "a"), - lambda s: s.str.pad(3, side="left", fillchar="a"), - id="lpad", - ), - param( - lambda s: s.rpad(3, "b"), - lambda s: s.str.pad(3, side="right", fillchar="b"), - id="rpad", - ), - param(lambda s: s.reverse(), lambda s: s.str[::-1], id="reverse"), - param(lambda s: s.lower(), lambda s: s.str.lower(), id="lower"), - param(lambda s: s.upper(), lambda s: s.str.upper(), id="upper"), - param(lambda s: s.repeat(2), lambda s: s * 2, id="repeat"), - param( - lambda s: s.contains("a"), - lambda s: s.str.contains("a", regex=False), - id="contains", - ), - param( - lambda s: ~(s.contains("a")), - lambda s: ~s.str.contains("a", regex=False), - id="not_contains", - ), - param( - 
lambda s: s.like("a"), - lambda s: s.str.contains("^a$", regex=True), - id="like", - ), - param( - lambda s: s.re_search("(ab)+"), - lambda s: s.str.contains("(?:ab)+", regex=True), - id="re_search", - ), - param( - lambda s: s.re_search("(ab)+") | s.re_search("d{1,2}ee"), - lambda s: ( - s.str.contains("(?:ab)+", regex=True) | s.str.contains("d{1,2}ee") - ), - id="re_search_or", - ), - param( - lambda s: s + s.rpad(3, "a"), - lambda s: s + s.str.pad(3, side="right", fillchar="a"), - id="rpad2", - ), - param( - lambda s: s.split(" "), - lambda s: s.apply(lambda x: np.array(x.split(" "))), - id="split_spaces", - ), - ], -) -def test_string_ops(t, df, case_func, expected_func): - # ignore matching UserWarnings - with catch_warnings(record=True): - expr = case_func(t.strings_with_space) - result = expr.execute() - series = expected_func(df.strings_with_space) - tm.assert_series_equal(result, series, check_names=False) - - -@pytest.mark.parametrize( - ("pattern", "expected"), - [ - ("%abc", ".*abc"), - ("abc%", "abc.*"), - ("6%", "6.*"), - ("%6%", ".*6.*"), - ("^%6", "%6"), - ("6^%", "6%"), - ("6^%%", "6%.*"), - ("^%%6", "%.*6"), - ("^%^%6", "%%6"), - ("6^%^%", "6%%"), - ("6_", "6."), - ("_6_", ".6."), - ("^_6", "_6"), - ("6^_", "6_"), - ("6^__", "6_."), - ("^__6", "_.6"), - ("^_^_6", "__6"), - ("6^_^_", "6__"), - ("6%_^%_", "6.*.%."), - ("6_^%%_", "6.%.*."), - ("_^%%_%_^%_%_^%^__^%%^_^%%6%_", ".%.*..*.%..*.%_.%.*_%.*6.*."), - ], -) -def test_sql_like_to_regex(pattern, expected): - result = sql_like_to_regex(pattern, escape="^") - assert result == f"^{expected}$" - - -@pytest.mark.parametrize( - ("from_func", "to_func", "from_str", "to_str"), - [ - param( - lambda s: s.translate_from_strings, - lambda s: s.translate_to_strings, - "rmzabcghj", - "lnsovkjfr", - id="from_series_to_series", - ), - param( - lambda s: "abc", - lambda s: s.translate_to_strings, - "abc", - "ovk", - id="from_string_to_series", - ), - param( - lambda s: s.translate_from_strings, - lambda s: "ovk", - "abcg", - "ovko", - id="from_series_to_string", - ), - ], -) -def test_translate( - t, df, from_func: callable, to_func: callable, from_str: str, to_str: str -): - result = t.strings_with_space.translate(from_func(t), to_func(t)).execute() - table = str.maketrans(from_str, to_str) - series = df.strings_with_space.str.translate(table) - tm.assert_series_equal(result, series, check_names=False) - - -def test_string_repeat(t): - int_col = t.plain_int64 - int_lit = ibis.literal(3) - string_col = t.strings_with_space - string_lit = ibis.literal("abc") - - expr1 = string_col.repeat(int_col) - expr2 = string_col.repeat(int_lit) - expr3 = string_lit.repeat(int_col) - expr4 = string_lit.repeat(int_lit) - - con = Backend() - con.execute(expr1) - con.execute(expr2) - con.execute(expr3) - con.execute(expr4) - - # TODO(kszucs): add assertions or rather parametrize the tests above diff --git a/ibis/backends/pandas/tests/test_structs.py b/ibis/backends/pandas/tests/test_structs.py deleted file mode 100644 index 16d997836e5f8..0000000000000 --- a/ibis/backends/pandas/tests/test_structs.py +++ /dev/null @@ -1,91 +0,0 @@ -from __future__ import annotations - -from collections import OrderedDict - -import pandas as pd -import pytest - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.pandas import Backend -from ibis.backends.pandas.tests.conftest import TestConf as tm - - -@pytest.fixture(scope="module") -def value(): - return OrderedDict([("fruit", "pear"), ("weight", 0)]) - - -@pytest.fixture(scope="module") -def 
struct_client(value): - df = pd.DataFrame( - { - "s": [ - OrderedDict([("fruit", "apple"), ("weight", None)]), - value, - OrderedDict([("fruit", "pear"), ("weight", 1)]), - ], - "key": list("aab"), - "value": [1, 2, 3], - } - ) - return Backend().connect({"t": df}) - - -@pytest.fixture -def struct_table(struct_client): - return struct_client.table( - "t", - schema={ - "s": dt.Struct.from_tuples([("fruit", dt.string), ("weight", dt.int8)]) - }, - ) - - -def test_struct_field_literal(value): - struct = ibis.literal(value) - assert struct.type() == dt.Struct.from_tuples( - [("fruit", dt.string), ("weight", dt.int8)] - ) - con = ibis.pandas.connect() - - expr = struct["fruit"] - result = con.execute(expr) - assert result == "pear" - - expr = struct["weight"] - result = con.execute(expr) - assert result == 0 - - expr = struct.cast("struct") - assert con.execute(expr) == {"fruit": "pear", "weight": 0.0} - - -def test_struct_field_series(struct_table): - t = struct_table - expr = t.s["fruit"] - result = expr.execute() - expected = pd.Series(["apple", "pear", "pear"], name="fruit") - tm.assert_series_equal(result, expected) - - -def test_struct_field_series_group_by_key(struct_table): - t = struct_table - expr = t.group_by(t.s["fruit"]).aggregate(total=t.value.sum()) - result = expr.execute() - expected = pd.DataFrame([("apple", 1), ("pear", 5)], columns=["fruit", "total"]) - tm.assert_frame_equal(result, expected) - - -def test_struct_field_series_group_by_value(struct_table): - t = struct_table - expr = t.group_by(t.key).aggregate(total=t.s["weight"].sum()) - result = expr.execute() - # these are floats because we have a NULL value in the input data - expected = pd.DataFrame([("a", 0.0), ("b", 1.0)], columns=["key", "total"]) - tm.assert_frame_equal( - result, - expected.assign( - total=lambda df: df.total.astype(expr.total.type().to_pandas()) - ), - ) diff --git a/ibis/backends/pandas/tests/test_temporal.py b/ibis/backends/pandas/tests/test_temporal.py deleted file mode 100644 index f8cf670e99f14..0000000000000 --- a/ibis/backends/pandas/tests/test_temporal.py +++ /dev/null @@ -1,180 +0,0 @@ -from __future__ import annotations - -import datetime -from operator import methodcaller - -import numpy as np -import pandas as pd -import pytest -from packaging.version import parse as parse_version -from pytest import param - -import ibis -from ibis import literal as L -from ibis.backends.pandas import Backend -from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.expr import datatypes as dt - - -@pytest.mark.parametrize( - ("case_func", "expected_func"), - [ - (lambda v: v.strftime("%Y%m%d"), lambda vt: vt.strftime("%Y%m%d")), - (lambda v: v.year(), lambda vt: vt.year), - (lambda v: v.month(), lambda vt: vt.month), - (lambda v: v.day(), lambda vt: vt.day), - (lambda v: v.hour(), lambda vt: vt.hour), - (lambda v: v.minute(), lambda vt: vt.minute), - (lambda v: v.second(), lambda vt: vt.second), - (lambda v: v.microsecond(), lambda vt: int(vt.microsecond)), - (lambda v: v.millisecond(), lambda vt: int(vt.microsecond / 1e3)), - ] - + [ - (methodcaller("strftime", pattern), methodcaller("strftime", pattern)) - for pattern in [ - "%Y%m%d %H", - 'DD BAR %w FOO "DD"', - 'DD BAR %w FOO "D', - 'DD BAR "%w" FOO "D', - 'DD BAR "%d" FOO "D', - 'DD BAR "%c" FOO "D', - 'DD BAR "%x" FOO "D', - 'DD BAR "%X" FOO "D', - ] - ], -) -def test_timestamp_functions(case_func, expected_func): - con = ibis.pandas.connect() - v = L("2015-09-01 14:48:05.359").cast("timestamp") - vt = datetime.datetime( - 
year=2015, - month=9, - day=1, - hour=14, - minute=48, - second=5, - microsecond=359000, - ) - result = case_func(v) - expected = expected_func(vt) - assert con.execute(result) == expected - - -@pytest.mark.parametrize( - "column", - ["datetime_strings_naive", "datetime_strings_ny", "datetime_strings_utc"], -) -def test_cast_datetime_strings_to_date(t, df, column): - expr = t[column].cast("date") - result = expr.execute() - expected = pd.to_datetime(df[column]).dt.normalize().dt.tz_localize(None).dt.date - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "column", - ["datetime_strings_naive", "datetime_strings_ny", "datetime_strings_utc"], -) -def test_cast_datetime_strings_to_timestamp(t, df, column): - expr = t[column].cast(dt.Timestamp(scale=9)) - result = expr.execute() - expected = pd.to_datetime(df[column]) - if getattr(expected.dtype, "tz", None) is not None: - expected = expected.dt.tz_convert(None) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - "column", - ["plain_datetimes_naive", "plain_datetimes_ny", "plain_datetimes_utc"], -) -def test_cast_integer_to_temporal_type(t, df, column): - column_type = t[column].type() - expr = t.plain_int64.cast(column_type) - result = expr.execute() - expected = pd.Series( - pd.to_datetime(df.plain_int64.values, unit="s").values, - index=df.index, - name="plain_int64", - ).dt.tz_localize(column_type.timezone) - tm.assert_series_equal(result, expected) - - -def test_cast_integer_to_date(t, df): - expr = t.plain_int64.cast("date") - result = expr.execute() - expected = pd.Series( - pd.to_datetime(df.plain_int64.values, unit="D").date, - index=df.index, - name="plain_int64", - ) - tm.assert_series_equal(result, expected) - - -def test_times_ops(t, df): - result = t.plain_datetimes_naive.time().between("10:00", "10:00").execute() - expected = pd.Series(np.zeros(len(df), dtype=bool)) - tm.assert_series_equal(result, expected) - - result = t.plain_datetimes_naive.time().between("01:00", "02:00").execute() - expected = pd.Series(np.ones(len(df), dtype=bool)) - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("tz", "rconstruct", "column"), - [ - ("US/Eastern", np.ones, "plain_datetimes_utc"), - ("US/Eastern", np.zeros, "plain_datetimes_naive"), - ("UTC", np.ones, "plain_datetimes_utc"), - ("UTC", np.ones, "plain_datetimes_naive"), - (None, np.ones, "plain_datetimes_utc"), - (None, np.ones, "plain_datetimes_naive"), - ], - ids=lambda x: str(getattr(x, "__name__", x)).lower().replace("/", "_"), -) -def test_times_ops_with_tz(t, df, tz, rconstruct, column): - expected = pd.Series(rconstruct(len(df), dtype=bool)) - time = t[column].time() - expr = time.between("01:00", "02:00", timezone=tz) - result = expr.execute() - tm.assert_series_equal(result, expected) - - # Test that casting behavior is the same as using the timezone kwarg - ts = t[column].cast(dt.Timestamp(timezone=tz)) - expr = ts.time().between("01:00", "02:00") - result = expr.execute() - tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize( - ("op", "expected"), - [ - param(lambda x, y: x + y, lambda x, y: x.values * 2, id="add"), - param(lambda x, y: x - y, lambda x, y: x.values - y.values, id="sub"), - param(lambda x, y: x * 2, lambda x, y: x.values * 2, id="mul"), - param( - lambda x, y: x // 2, - lambda x, y: x.values // 2, - id="floordiv", - marks=pytest.mark.xfail( - parse_version(pd.__version__) < parse_version("0.23.0"), - raises=TypeError, - reason=( - "pandas versions less than 0.23.0 do not 
support floor " - "division involving timedelta columns" - ), - ), - ), - ], -) -def test_interval_arithmetic(op, expected): - data = pd.timedelta_range("0 days", "10 days", freq="D") - con = Backend().connect( - {"df1": pd.DataFrame({"td": data}), "df2": pd.DataFrame({"td": data})} - ) - t1 = con.table("df1") - expr = op(t1.td, t1.td) - result = expr.execute() - expected = pd.Series(expected(data, data), name="td") - tm.assert_series_equal(result, expected) diff --git a/ibis/backends/pandas/tests/test_udf.py b/ibis/backends/pandas/tests/test_udf.py deleted file mode 100644 index feaf67b27ab0e..0000000000000 --- a/ibis/backends/pandas/tests/test_udf.py +++ /dev/null @@ -1,443 +0,0 @@ -from __future__ import annotations - -import collections - -import numpy as np -import pandas as pd -import pytest -from packaging.version import parse as vparse - -import ibis -import ibis.expr.datatypes as dt -import ibis.expr.types as ir -from ibis.backends.pandas import Backend -from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.backends.pandas.udf import udf - - -@pytest.fixture -def df(): - return pd.DataFrame( - { - "a": list("abc"), - "b": [1, 2, 3], - "c": [4.0, 5.0, 6.0], - "key": list("aab"), - } - ) - - -@pytest.fixture -def df2(): - return pd.DataFrame( - { - "a": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "b": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "c": np.arange(7, dtype=int).tolist(), - "key": list("ddeefff"), - } - ) - - -@pytest.fixture -def con(df, df2): - return Backend().connect({"df": df, "df2": df2}) - - -@pytest.fixture -def t(con): - return con.table("df") - - -@pytest.fixture -def t2(con): - return con.table("df2") - - -with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise(input_type=["string"], output_type="int64") - def my_string_length(series, **kwargs): - return series.str.len() * 2 - - @udf.elementwise(input_type=[dt.double, dt.double], output_type=dt.double) - def my_add(series1, series2, **kwargs): - return series1 + series2 - - @udf.reduction(["double"], "double") - def my_mean(series): - return series.mean() - - @udf.reduction(input_type=[dt.string], output_type=dt.int64) - def my_string_length_sum(series, **kwargs): - return (series.str.len() * 2).sum() - - @udf.reduction(input_type=[dt.double, dt.double], output_type=dt.double) - def my_corr(lhs, rhs, **kwargs): - return lhs.corr(rhs) - - @udf.elementwise([dt.double], dt.double) - def add_one(x): - return x + 1.0 - - @udf.elementwise([dt.double], dt.double) - def times_two(x): - return x * 2.0 - - @udf.analytic(input_type=["double"], output_type="double") - def zscore(series): - return (series - series.mean()) / series.std() - - @udf.reduction( - input_type=[dt.double], - output_type=dt.Array(dt.double), - ) - def quantiles(series, *, quantiles): - return np.array(series.quantile(quantiles)) - - -def test_udf(t, df): - expr = my_string_length(t.a) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - expected = df.a.str.len().mul(2) - tm.assert_series_equal(result, expected) - - -def test_multiple_argument_udf(con, t, df): - expr = my_add(t.b, t.c) - - assert isinstance(expr, ir.Column) - assert isinstance(expr, ir.NumericColumn) - assert isinstance(expr, ir.FloatingColumn) - - result = expr.execute() - expected = df.b + df.c - tm.assert_series_equal(result, expected) - - -def test_multiple_argument_udf_group_by(con, t, df): - expr = t.group_by(t.key).aggregate(my_add=my_add(t.b, t.c).sum()) - - assert isinstance(expr, 
ir.Table) - assert isinstance(expr.my_add, ir.Column) - assert isinstance(expr.my_add, ir.NumericColumn) - assert isinstance(expr.my_add, ir.FloatingColumn) - - result = expr.execute() - expected = pd.DataFrame( - {"key": list("ab"), "my_add": [sum([1.0 + 4.0, 2.0 + 5.0]), 3.0 + 6.0]} - ) - tm.assert_frame_equal(result, expected) - - -def test_udaf(con, t, df): - expr = my_string_length_sum(t.a) - - assert isinstance(expr, ir.Scalar) - - result = expr.execute() - expected = t.a.execute().str.len().mul(2).sum() - assert result == expected - - -def test_udaf_analytic(con, t, df): - expr = zscore(t.c) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - - def f(s): - return s.sub(s.mean()).div(s.std()) - - expected = f(df.c) - tm.assert_series_equal(result, expected) - - -def test_udaf_analytic_groupby(con, t, df): - expr = zscore(t.c).over(ibis.window(group_by=t.key)) - - assert isinstance(expr, ir.Column) - - result = expr.execute() - - def f(s): - return s.sub(s.mean()).div(s.std()) - - expected = df.groupby("key").c.transform(f) - expected.name = None - tm.assert_series_equal(result, expected) - - -def test_udaf_groupby(): - df = pd.DataFrame( - { - "a": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "b": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "key": list("ddeefff"), - } - ) - con = Backend().connect({"df": df}) - t = con.table("df") - - expr = t.group_by(t.key).aggregate(my_corr=my_corr(t.a, t.b)) - - assert isinstance(expr, ir.Table) - - result = expr.execute().sort_values("key") - - dfi = df.set_index("key") - expected = pd.DataFrame( - { - "key": list("def"), - "my_corr": [ - dfi.loc[value, "a"].corr(dfi.loc[value, "b"]) for value in "def" - ], - } - ) - - columns = ["key", "my_corr"] - tm.assert_frame_equal(result[columns], expected[columns]) - - -def test_udaf_parameter_mismatch(): - with pytest.raises(TypeError): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.reduction(input_type=[dt.double], output_type=dt.double) - def my_corr(lhs, rhs, **kwargs): - pass - - -def test_udf_parameter_mismatch(): - with pytest.raises(TypeError): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.reduction(input_type=[], output_type=dt.double) - def my_corr2(lhs, **kwargs): - pass - - -def test_udf_error(t): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise(input_type=[dt.double], output_type=dt.double) - def error_udf(s): - raise ValueError("xxx") - - with pytest.raises(ValueError): - error_udf(t.c).execute() - - -def test_udf_no_reexecution(t2): - execution_count = 0 - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise(input_type=[dt.double], output_type=dt.double) - def times_two_count_executions(x): - nonlocal execution_count - execution_count += 1 - return x * 2.0 - - expr = t2.mutate(doubled=times_two_count_executions(t2.a)) - expr.execute() - - assert execution_count == 1 - - -def test_compose_udfs(t2, df2): - expr = times_two(add_one(t2.a)) - result = expr.execute() - expected = df2.a.add(1.0).mul(2.0) - tm.assert_series_equal(expected, result) - - -def test_udaf_window(t2, df2): - window = ibis.trailing_window(2, order_by="a", group_by="key") - expr = t2.mutate(rolled=my_mean(t2.b).over(window)) - result = expr.execute().sort_values(["key", "a"]) - expected = df2.sort_values(["key", "a"]).assign( - rolled=lambda df: df.groupby("key") - .b.rolling(3, min_periods=1) - .mean() - .reset_index(level=0, drop=True) - ) - tm.assert_frame_equal(result, expected) - - 
-@pytest.mark.xfail( - condition=vparse("1.4") <= vparse(pd.__version__) < vparse("1.4.2"), - raises=ValueError, - reason="https://github.com/pandas-dev/pandas/pull/44068", -) -def test_udaf_window_interval(): - df = pd.DataFrame( - collections.OrderedDict( - [ - ( - "time", - pd.date_range(start="20190105", end="20190101", freq="-1D"), - ), - ("key", [1, 2, 1, 2, 1]), - ("value", np.arange(5)), - ] - ) - ) - - con = Backend().connect({"df": df}) - t = con.table("df") - window = ibis.trailing_range_window( - ibis.interval(days=2), order_by="time", group_by="key" - ) - - expr = t.mutate(rolled=my_mean(t.value).over(window)) - - result = expr.execute().sort_values(["time", "key"]).reset_index(drop=True) - expected = ( - df.sort_values(["time", "key"]) - .set_index("time") - .assign( - rolled=lambda df: df.groupby("key") - .value.rolling("2D", closed="both") - .mean() - .reset_index(level=0, drop=True) - ) - ).reset_index(drop=False) - - tm.assert_frame_equal(result, expected) - - -def test_multiple_argument_udaf_window(): - # PR 2035 - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.reduction(["double", "double"], "double") - def my_wm(v, w): - return np.average(v, weights=w) - - df = pd.DataFrame( - { - "a": np.arange(4, 0, dtype=float, step=-1).tolist() - + np.random.rand(3).tolist(), - "b": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "c": np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(), - "d": np.repeat(1, 7), - "key": list("deefefd"), - } - ) - con = Backend().connect({"df": df}) - t = con.table("df") - window = ibis.trailing_window(2, order_by="a", group_by="key") - window2 = ibis.trailing_window(1, order_by="b", group_by="key") - expr = t.mutate( - wm_b=my_wm(t.b, t.d).over(window), - wm_c=my_wm(t.c, t.d).over(window), - wm_c2=my_wm(t.c, t.d).over(window2), - ) - result = expr.execute().sort_values(["key", "a"]) - expected = ( - df.sort_values(["key", "a"]) - .assign( - wm_b=lambda df: df.groupby("key") - .b.rolling(3, min_periods=1) - .mean() - .reset_index(level=0, drop=True) - ) - .assign( - wm_c=lambda df: df.groupby("key") - .c.rolling(3, min_periods=1) - .mean() - .reset_index(level=0, drop=True) - ) - ) - expected = expected.sort_values(["key", "b"]).assign( - wm_c2=lambda df: df.groupby("key") - .c.rolling(2, min_periods=1) - .mean() - .reset_index(level=0, drop=True) - ) - expected = expected.sort_values(["key", "a"]) - - tm.assert_frame_equal(result, expected) - - -@pytest.fixture(params=[[0.25, 0.75], [0.01, 0.99]]) -def qs(request): - return request.param - - -def test_array_return_type_reduction(con, t, df, qs): - """Tests reduction UDF returning an array.""" - expr = quantiles(t.b, quantiles=qs) - result = expr.execute() - expected = np.array(df.b.quantile(qs)) - np.testing.assert_array_equal(result, expected) - - -def test_array_return_type_reduction_window(con, t, df, qs): - """Tests reduction UDF returning an array, used over a window.""" - expr = quantiles(t.b, quantiles=qs).over(ibis.window()) - result = expr.execute() - expected_raw = df.b.quantile(qs).tolist() - expected = pd.Series([expected_raw] * len(df)) - tm.assert_series_equal(result, expected) - - -def test_array_return_type_reduction_group_by(con, t, df, qs): - """Tests reduction UDF returning an array, used in a grouped aggregation. - - Getting this use case to succeed required avoiding use of - `SeriesGroupBy.agg` in the `Summarize` aggcontext implementation - (#2768). 
- """ - expr = t.group_by(t.key).aggregate(quantiles_col=quantiles(t.b, quantiles=qs)) - result = expr.execute() - - expected_col = df.groupby(df.key).b.agg(lambda s: s.quantile(qs).tolist()) - expected = pd.DataFrame({"quantiles_col": expected_col}).reset_index() - - tm.assert_frame_equal(result, expected) - - -def test_elementwise_udf_with_many_args(t2): - with pytest.warns(FutureWarning, match="v9.0"): - - @udf.elementwise( - input_type=[dt.double] * 16 + [dt.int32] * 8, output_type=dt.double - ) - def my_udf( - c1, - c2, - c3, - c4, - c5, - c6, - c7, - c8, - c9, - c10, - c11, - c12, - c13, - c14, - c15, - c16, - c17, - c18, - c19, - c20, - c21, - c22, - c23, - c24, - ): - return c1 - - expr = my_udf(*([t2.a] * 8 + [t2.b] * 8 + [t2.c] * 8)) - result = expr.execute() - expected = t2.a.execute() - - tm.assert_series_equal(result, expected) diff --git a/ibis/backends/pandas/tests/test_window.py b/ibis/backends/pandas/tests/test_window.py deleted file mode 100644 index a0cf0f4e3eed0..0000000000000 --- a/ibis/backends/pandas/tests/test_window.py +++ /dev/null @@ -1,641 +0,0 @@ -from __future__ import annotations - -import io -from datetime import date -from operator import methodcaller - -import numpy as np -import pandas as pd -import pytest - -import ibis -import ibis.expr.datatypes as dt -from ibis.backends.pandas import Backend -from ibis.backends.pandas.tests.conftest import TestConf as tm -from ibis.legacy.udf.vectorized import reduction - - -@pytest.fixture(scope="session") -def sort_kind(): - return "mergesort" - - -default = pytest.mark.parametrize("default", [ibis.null(), ibis.literal("a")]) -row_offset = pytest.mark.parametrize("row_offset", list(map(ibis.literal, [-1, 1, 0]))) -range_offset = pytest.mark.parametrize( - "range_offset", - [ - ibis.interval(days=1), - 2 * ibis.interval(days=1), - -2 * ibis.interval(days=1), - ], -) - - -@pytest.fixture -def row_window(): - return ibis.window(following=0, order_by="plain_int64") - - -@pytest.fixture -def range_window(): - return ibis.window(following=0, order_by="plain_datetimes_naive") - - -@default -@row_offset -def test_lead(t, df, row_offset, default, row_window): - con = ibis.pandas.connect() - expr = t.dup_strings.lead(row_offset, default=default).over(row_window) - result = expr.execute() - expected = df.dup_strings.shift(con.execute(-row_offset)) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected.rename("tmp")) - - -@default -@row_offset -def test_lag(t, df, row_offset, default, row_window): - con = ibis.pandas.connect() - expr = t.dup_strings.lag(row_offset, default=default).over(row_window) - result = expr.execute() - expected = df.dup_strings.shift(con.execute(row_offset)) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected.rename("tmp")) - - -@default -@range_offset -def test_lead_delta(t, df, range_offset, default, range_window): - con = ibis.pandas.connect() - expr = t.dup_strings.lead(range_offset, default=default).over(range_window) - result = expr.execute() - expected = ( - df[["plain_datetimes_naive", "dup_strings"]] - .set_index("plain_datetimes_naive") - .squeeze() - .shift(freq=con.execute(-range_offset)) - .reindex(df.plain_datetimes_naive) - .reset_index(drop=True) - ) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected.rename("tmp")) - - -@default -@range_offset -def test_lag_delta(t, df, 
range_offset, default, range_window): - con = ibis.pandas.connect() - expr = t.dup_strings.lag(range_offset, default=default).over(range_window) - result = expr.execute() - - expected = ( - df[["plain_datetimes_naive", "dup_strings"]] - .set_index("plain_datetimes_naive") - .squeeze() - .shift(freq=con.execute(range_offset)) - .reindex(df.plain_datetimes_naive) - .reset_index(drop=True) - ) - if default is not ibis.null(): - expected = expected.fillna(con.execute(default)) - tm.assert_series_equal(result, expected.rename("tmp")) - - -def test_first(t, df): - expr = t.dup_strings.first() - result = expr.execute() - expected = df.dup_strings.iat[0] - assert result == expected - - -def test_last(t, df): - expr = t.dup_strings.last() - result = expr.execute() - expected = df.dup_strings.iat[-1] - assert result == expected - - -def test_first_and_last_over_window(t): - def simple_window_ops(table): - w = ibis.window( - order_by=[table.plain_uint64, table.dup_ints], - preceding=1, - following=0, - ) - return table.mutate( - x_first=t.plain_uint64.first().over(w), - x_last=t.plain_uint64.last().over(w), - y_first=t.dup_ints.first().over(w), - y_last=t.dup_ints.last().over(w), - ) - - assert simple_window_ops(t).execute() is not None - - -def test_group_by_mutate_analytic(t, df): - gb = t.group_by(t.dup_strings) - expr = gb.mutate( - first_value=t.plain_int64.first(), - last_value=t.plain_strings.last(), - avg_broadcast=t.plain_float64 - t.plain_float64.mean(), - delta=(t.plain_int64 - t.plain_int64.lag()) - / (t.plain_float64 - t.plain_float64.lag()), - ) - result = expr.execute() - - gb = df.groupby("dup_strings") - expected = df.assign( - last_value=gb.plain_strings.transform("last"), - first_value=gb.plain_int64.transform("first"), - avg_broadcast=df.plain_float64 - gb.plain_float64.transform("mean"), - delta=( - (df.plain_int64 - gb.plain_int64.shift(1)) - / (df.plain_float64 - gb.plain_float64.shift(1)) - ), - ) - - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_players(players, players_df): - lagged = players.mutate(pct=lambda t: t.G - t.G.lag()) - expected = players_df.assign( - pct=players_df.G - players_df.groupby("playerID").G.shift(1) - ) - cols = expected.columns.tolist() - result = lagged.execute()[cols].sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -def test_batting_filter_mean(batting, batting_df): - expr = batting.filter(batting.G > batting.G.mean()) - result = expr.execute() - expected = batting_df[batting_df.G > batting_df.G.mean()].reset_index(drop=True) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_zscore(players, players_df): - expr = players.mutate(g_z=lambda t: (t.G - t.G.mean()) / t.G.std()) - - gb = players_df.groupby("playerID") - expected = players_df.assign( - g_z=(players_df.G - gb.G.transform("mean")) / gb.G.transform("std") - ) - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -def test_batting_avg_change_in_games_per_year(players, players_df): - expr = players.mutate( - delta=lambda t: (t.G - t.G.lag()) / (t.yearID - t.yearID.lag()) - ) - - gb = players_df.groupby("playerID") - expected = players_df.assign( - delta=(players_df.G - gb.G.shift(1)) / (players_df.yearID - gb.yearID.shift(1)) - ) - - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - 
-@pytest.mark.xfail(raises=AssertionError, reason="NYI") -def test_batting_most_hits(players, players_df): - expr = players.mutate( - hits_rank=lambda t: t.H.rank().over( - ibis.cumulative_window(order_by=ibis.desc(t.H)) - ) - ) - result = expr.execute() - hits_rank = players_df.groupby("playerID").H.rank(method="min", ascending=False) - expected = players_df.assign(hits_rank=hits_rank) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_quantile(players, players_df): - expr = players.mutate(hits_quantile=lambda t: t.H.quantile(0.25)) - hits_quantile = players_df.groupby("playerID").H.transform("quantile", 0.25) - expected = players_df.assign(hits_quantile=hits_quantile) - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -def test_batting_approx_median(players, players_df): - expr = players.mutate(hits_median=lambda t: t.H.approx_median()) - hits_median = players_df.groupby("playerID").H.transform("median") - expected = players_df.assign(hits_median=hits_median) - cols = expected.columns.tolist() - result = expr.execute()[cols].sort_values(cols).reset_index(drop=True) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("op", ["sum", "mean", "min", "max"]) -def test_batting_specific_cumulative(batting, batting_df, op, sort_kind): - ibis_method = methodcaller(f"cum{op}", order_by=batting.yearID) - expr = ibis_method(batting.G) - result = expr.execute().astype("float64") - - pandas_method = methodcaller(op) - expected = pandas_method( - batting_df[["G", "yearID"]].sort_values("yearID", kind=sort_kind).G.expanding() - ).reset_index(drop=True) - tm.assert_series_equal(result, expected.rename("tmp")) - - -def test_batting_cumulative(batting, batting_df, sort_kind): - expr = batting.mutate( - more_values=lambda t: t.G.sum().over(ibis.cumulative_window(order_by=t.yearID)) - ) - result = expr.execute() - - columns = ["G", "yearID"] - more_values = ( - batting_df[columns] - .sort_values("yearID", kind=sort_kind) - .G.expanding() - .sum() - .astype("int64") - ) - expected = batting_df.assign(more_values=more_values) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_cumulative_partitioned(batting, batting_df, sort_kind): - group_by = "playerID" - order_by = "yearID" - - t = batting - expr = t.G.sum().over(ibis.cumulative_window(order_by=order_by, group_by=group_by)) - expr = t.mutate(cumulative=expr) - result = expr.execute() - - columns = [group_by, order_by, "G"] - expected = ( - batting_df[columns] - .set_index(order_by) - .groupby(group_by) - .G.expanding() - .sum() - .rename("cumulative") - ) - - tm.assert_series_equal( - result.set_index([group_by, order_by]).sort_index().cumulative, - expected.sort_index().astype("int64"), - ) - - -def test_batting_rolling(batting, batting_df, sort_kind): - expr = batting.mutate( - more_values=lambda t: t.G.sum().over(ibis.trailing_window(5, order_by=t.yearID)) - ) - result = expr.execute() - - columns = ["G", "yearID"] - more_values = ( - batting_df[columns] - .sort_values("yearID", kind=sort_kind) - .G.rolling(6, min_periods=1) - .sum() - .astype("int64") - ) - expected = batting_df.assign(more_values=more_values) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_batting_rolling_partitioned(batting, batting_df, sort_kind): - t = batting - group_by = "playerID" - order_by = "yearID" - expr = t.G.sum().over( - ibis.trailing_window(3, order_by=t[order_by], 
group_by=t[group_by]) - ) - expr = t.mutate(rolled=expr) - result = expr.execute() - - columns = [group_by, order_by, "G"] - expected = ( - batting_df[columns] - .set_index(order_by) - .groupby(group_by) - .G.rolling(4, min_periods=1) - .sum() - .rename("rolled") - ) - - tm.assert_series_equal( - result.set_index([group_by, order_by]).sort_index().rolled, - expected.sort_index().astype("int64"), - ) - - -def test_scalar_broadcasting(batting, batting_df): - expr = batting.mutate(demeaned=batting.G - batting.G.mean()) - result = expr.execute() - expected = batting_df.assign(demeaned=batting_df.G - batting_df.G.mean()) - tm.assert_frame_equal(result, expected) - - -def test_mutate_with_window_after_join(sort_kind): - left_df = pd.DataFrame( - { - "ints": [0, 1, 2], - "strings": ["a", "b", "c"], - "dates": pd.date_range("20170101", periods=3), - } - ) - right_df = pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ) - con = Backend().connect({"left": left_df, "right": right_df}) - left, right = map(con.table, ("left", "right")) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined.select(left, right.value) - expr = proj.group_by("ints").mutate(sum=proj.value.sum()) - result = expr.execute() - expected = pd.DataFrame( - { - "dates": pd.concat([left_df.dates] * 3) - .sort_values(kind=sort_kind) - .reset_index(drop=True), - "ints": [0] * 3 + [1] * 3 + [2] * 3, - "strings": ["a"] * 3 + ["b"] * 3 + ["c"] * 3, - "value": [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0], - "sum": [9.0] * 3 + [12.0] * 3 + [8.0] * 3, - } - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_mutate_scalar_with_window_after_join(): - left_df = pd.DataFrame({"ints": range(3)}) - right_df = pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ) - con = Backend().connect({"left": left_df, "right": right_df}) - left, right = map(con.table, ("left", "right")) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined.select(left, right.value) - expr = proj.mutate(sum=proj.value.sum(), const=ibis.literal(1)) - result = expr.execute() - expected = pd.DataFrame( - { - "ints": [0] * 3 + [1] * 3 + [2] * 3, - "value": [0.0, 3.0, 6.0, 1.0, 4.0, 7.0, np.nan, np.nan, 8.0], - "sum": [29.0] * 9, - "const": np.ones(9, dtype="int8"), - } - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_project_scalar_after_join(): - left_df = pd.DataFrame({"ints": range(3)}) - right_df = pd.DataFrame( - { - "group": [0, 1, 2] * 3, - "value": [0, 1, np.nan, 3, 4, np.nan, 6, 7, 8], - } - ) - con = ibis.pandas.connect({"left": left_df, "right": right_df}) - left, right = map(con.table, ("left", "right")) - - joined = left.outer_join(right, left.ints == right.group) - proj = joined.select(left, right.value) - expr = proj.select(proj.value.sum().name("sum"), ibis.literal(1).name("const")) - result = expr.execute() - expected = pd.DataFrame( - { - "sum": [29.0] * 9, - "const": np.ones(9, dtype="int8"), - } - ) - tm.assert_frame_equal(result[expected.columns], expected) - - -def test_project_list_scalar(): - df = pd.DataFrame({"ints": range(3)}) - con = ibis.pandas.connect({"df": df}) - table = con.table("df") - expr = table.mutate(res=table.ints.quantile([0.5, 0.95])) - result = expr.execute() - - expected = pd.Series([[1.0, 1.9] for _ in range(3)], name="res") - tm.assert_series_equal(result.res, expected) - - -@pytest.mark.parametrize( - "index", - [ - 
pytest.param(lambda time: None, id="no_index"), - pytest.param(lambda time: time, id="index"), - ], -) -def test_window_with_preceding_expr(index): - time = pd.date_range("20180101", "20180110") - start = 2 - data = np.arange(start, start + len(time)) - df = pd.DataFrame({"value": data, "time": time}, index=index(time)) - client = ibis.pandas.connect({"df": df}) - t = client.table("df") - expected = ( - df.set_index("time") - .value.rolling("3d", closed="both") - .mean() - .reset_index(drop=True) - ) - expected.index.name = None - day = ibis.interval(days=1) - window = ibis.trailing_window(3 * day, order_by=t.time) - expr = t.value.mean().over(window) - result = expr.execute() - tm.assert_series_equal(result, expected.rename("mean")) - - -def test_window_grouping_key_has_scope(t, df): - param = ibis.param(dt.string) - window = ibis.window(group_by=t.dup_strings + param) - expr = t.plain_int64.mean().over(window) - result = expr.execute(params={param: "a"}) - expected = df.groupby(df.dup_strings + "a").plain_int64.transform("mean") - tm.assert_series_equal(result, expected.rename("mean")) - - -def test_window_on_and_by_key_as_window_input(t, df): - order_by = "plain_int64" - group_by = "dup_ints" - control = "plain_float64" - - row_window = ibis.trailing_window(order_by=order_by, group_by=group_by, preceding=1) - - # Test built-in function - - tm.assert_series_equal( - t[order_by].count().over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - t[group_by].count().over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - # Test UDF - with pytest.warns(FutureWarning, match="v9.0"): - - @reduction(input_type=[dt.int64], output_type=dt.int64) - def count(v): - return len(v) - - @reduction(input_type=[dt.int64, dt.int64], output_type=dt.int64) - def count_both(v1, v2): - return len(v1) - - tm.assert_series_equal( - count(t[order_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - count(t[group_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - tm.assert_series_equal( - count_both(t[group_by], t[order_by]).over(row_window).execute(), - t[control].count().over(row_window).execute(), - check_names=False, - ) - - -@pytest.mark.parametrize( - "group_by,order_by", - [ - (None, None), - # Enable this after #2395 is merged - # (None, 'plain_datetimes_utc'), - ("dup_ints", None), - ("dup_ints", "plain_datetimes_utc"), - ], -) -def test_rolling_window_udf_nan_and_non_numeric(t, group_by, order_by): - # Test that rolling window can be performed on - # (1) A column that contains NaN values - # (2) A non-numeric column - # (3) A non-numeric column that contains NaN value - - t = t.mutate(nan_int64=t["plain_int64"]) - t = t.mutate(nan_int64=None) - - with pytest.warns(FutureWarning, match="v9.0"): - - @reduction(input_type=[dt.int64], output_type=dt.int64) - def count_int64(v): - return len(v) - - @reduction(input_type=[dt.timestamp], output_type=dt.int64) - def count_timestamp(v): - return len(v) - - @reduction( - input_type=[t["map_of_strings_integers"].type()], output_type=dt.int64 - ) - def count_complex(v): - return len(v) - - window = ibis.trailing_window(preceding=1, order_by=order_by, group_by=group_by) - - result_nan = count_int64(t["nan_int64"]).over(window).execute() - result_non_numeric = ( - 
count_timestamp(t["plain_datetimes_utc"]).over(window).execute() - ) - result_nan_non_numeric = ( - count_timestamp(t["map_of_strings_integers"]).over(window).execute() - ) - expected = t["plain_int64"].count().over(window).execute() - - tm.assert_series_equal(result_nan, expected, check_names=False) - tm.assert_series_equal(result_non_numeric, expected, check_names=False) - tm.assert_series_equal(result_nan_non_numeric, expected, check_names=False) - - -@pytest.fixture -def events(): - df = pd.DataFrame( - { - "event_id": [1] * 4 + [2] * 6 + [3] * 2, - "measured_on": map( - pd.Timestamp, - map( - date, - [2021] * 12, - [6] * 4 + [5] * 6 + [7] * 2, - range(1, 13), - ), - ), - "measurement": np.nan, - } - ) - df.at[1, "measurement"] = 5.0 - df.at[4, "measurement"] = 42.0 - df.at[5, "measurement"] = 42.0 - df.at[7, "measurement"] = 11.0 - return df - - -def test_bfill(events): - con = ibis.pandas.connect({"t": events}) - t = con.table("t") - - win = ibis.window( - group_by=t.event_id, order_by=ibis.desc(t.measured_on), following=0 - ) - grouped = t.mutate(grouper=t.measurement.count().over(win)) - - expr = ( - grouped.group_by([grouped.event_id, grouped.grouper]) - .mutate(bfill=grouped.measurement.max()) - .order_by("measured_on") - ) - result = expr.execute().reset_index(drop=True) - - expected_raw = """\ -event_id measured_on measurement grouper bfill - 2 2021-05-05 42.0 3 42.0 - 2 2021-05-06 42.0 2 42.0 - 2 2021-05-07 NaN 1 11.0 - 2 2021-05-08 11.0 1 11.0 - 2 2021-05-09 NaN 0 NaN - 2 2021-05-10 NaN 0 NaN - 1 2021-06-01 NaN 1 5.0 - 1 2021-06-02 5.0 1 5.0 - 1 2021-06-03 NaN 0 NaN - 1 2021-06-04 NaN 0 NaN - 3 2021-07-11 NaN 0 NaN - 3 2021-07-12 NaN 0 NaN""" - expected = pd.read_csv( - io.StringIO(expected_raw), - sep=r"\s+", - header=0, - parse_dates=["measured_on"], - ) - tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/pandas/udf.py b/ibis/backends/pandas/udf.py deleted file mode 100644 index 3168d348f67d5..0000000000000 --- a/ibis/backends/pandas/udf.py +++ /dev/null @@ -1,23 +0,0 @@ -"""APIs for creating user-defined functions.""" - -from __future__ import annotations - -import ibis.legacy.udf.vectorized - - -class udf: - @staticmethod - def elementwise(input_type, output_type): - """Alias for ibis.legacy.udf.vectorized.elementwise.""" - - return ibis.legacy.udf.vectorized.elementwise(input_type, output_type) - - @staticmethod - def reduction(input_type, output_type): - """Alias for ibis.legacy.udf.vectorized.reduction.""" - return ibis.legacy.udf.vectorized.reduction(input_type, output_type) - - @staticmethod - def analytic(input_type, output_type): - """Alias for ibis.legacy.udf.vectorized.analytic.""" - return ibis.legacy.udf.vectorized.analytic(input_type, output_type) diff --git a/ibis/backends/polars/__init__.py b/ibis/backends/polars/__init__.py index 6e19290dec467..e0461ac8202bf 100644 --- a/ibis/backends/polars/__init__.py +++ b/ibis/backends/polars/__init__.py @@ -13,15 +13,11 @@ import ibis.expr.schema as sch import ibis.expr.types as ir from ibis.backends import BaseBackend, NoUrl -from ibis.backends.pandas.rewrites import ( - bind_unbound_table, - replace_parameter, - rewrite_join, -) from ibis.backends.polars.compiler import translate +from ibis.backends.polars.rewrites import bind_unbound_table, rewrite_join from ibis.backends.sql.dialects import Polars from ibis.common.dispatch import lazy_singledispatch -from ibis.expr.rewrites import lower_stringslice +from ibis.expr.rewrites import lower_stringslice, replace_parameter from ibis.formats.polars import 
PolarsSchema from ibis.util import deprecated, gen_name, normalize_filename, normalize_filenames diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 907ced2525829..eb4d9cb532328 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -14,7 +14,7 @@ import ibis.common.exceptions as com import ibis.expr.datatypes as dt import ibis.expr.operations as ops -from ibis.backends.pandas.rewrites import PandasAsofJoin, PandasJoin, PandasRename +from ibis.backends.polars.rewrites import PandasAsofJoin, PandasJoin, PandasRename from ibis.backends.sql.compilers.base import STAR from ibis.backends.sql.dialects import Polars from ibis.expr.operations.udf import InputType diff --git a/ibis/backends/polars/rewrites.py b/ibis/backends/polars/rewrites.py new file mode 100644 index 0000000000000..24768b80fd617 --- /dev/null +++ b/ibis/backends/polars/rewrites.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from public import public + +import ibis.expr.datatypes as dt +import ibis.expr.operations as ops +from ibis.common.annotations import attribute +from ibis.common.collections import FrozenDict +from ibis.common.patterns import replace +from ibis.common.typing import VarTuple # noqa: TCH001 +from ibis.expr.schema import Schema + + +@public +class PandasRename(ops.Relation): + parent: ops.Relation + mapping: FrozenDict[str, str] + + @classmethod + def from_prefix(cls, parent, prefix): + mapping = {k: f"{prefix}_{k}" for k in parent.schema} + return cls(parent, mapping) + + @attribute + def values(self): + return FrozenDict( + {to: ops.Field(self.parent, from_) for from_, to in self.mapping.items()} + ) + + @attribute + def schema(self): + return Schema( + {self.mapping[name]: dtype for name, dtype in self.parent.schema.items()} + ) + + +@public +class PandasJoin(ops.Relation): + left: ops.Relation + right: ops.Relation + left_on: VarTuple[ops.Value] + right_on: VarTuple[ops.Value] + how: str + + @attribute + def values(self): + return FrozenDict({**self.left.values, **self.right.values}) + + @attribute + def schema(self): + return self.left.schema | self.right.schema + + +@public +class PandasAsofJoin(PandasJoin): + left_by: VarTuple[ops.Value] + right_by: VarTuple[ops.Value] + operator: type + + +def split_join_predicates(left, right, predicates, only_equality=True): + left_on = [] + right_on = [] + for pred in predicates: + if left not in pred.relations or right not in pred.relations: + # not a usual join predicate, so apply a trick by placing the + # predicate to the left side and adding a literal True to the right + # which the left side must be equal to + left_on.append(pred) + right_on.append(ops.Literal(True, dtype=dt.boolean)) + elif isinstance(pred, ops.Binary): + if only_equality and not isinstance(pred, ops.Equals): + raise TypeError("Only equality join predicates supported with pandas") + if left in pred.left.relations and right in pred.right.relations: + left_on.append(pred.left) + right_on.append(pred.right) + elif left in pred.right.relations and right in pred.left.relations: + left_on.append(pred.right) + right_on.append(pred.left) + else: + raise ValueError("Join predicate does not reference both tables") + else: + raise TypeError(f"Unsupported join predicate {pred}") + + return left_on, right_on + + +@replace(ops.JoinChain) +def rewrite_join(_, **kwargs): + # TODO(kszucs): JoinTable.index can be used as a prefix + prefixes = {} + prefixes[_.first] = prefix = str(len(prefixes)) + left = 
PandasRename.from_prefix(_.first, prefix) + + for link in _.rest: + prefixes[link.table] = prefix = str(len(prefixes)) + right = PandasRename.from_prefix(link.table, prefix) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + subs.update({v: ops.Field(right, k) for k, v in right.values.items()}) + preds = [pred.replace(subs, filter=ops.Value) for pred in link.predicates] + + # separate ASOF from the rest of the joins + if link.how == "asof": + on, *by = preds + left_on, right_on = split_join_predicates( + left, right, [on], only_equality=False + ) + left_by, right_by = split_join_predicates(left, right, by) + left = PandasAsofJoin( + how="asof", + left=left, + right=right, + left_on=left_on, + right_on=right_on, + left_by=left_by, + right_by=right_by, + operator=type(on), + ) + else: + # need to replace the fields in the predicates + left_on, right_on = split_join_predicates(left, right, preds) + left = PandasJoin( + how=link.how, + left=left, + right=right, + left_on=left_on, + right_on=right_on, + ) + + subs = {v: ops.Field(left, k) for k, v in left.values.items()} + fields = {k: v.replace(subs, filter=ops.Value) for k, v in _.values.items()} + return ops.Project(left, fields) + + +@replace(ops.UnboundTable) +def bind_unbound_table(_, backend, **kwargs): + return ops.DatabaseTable(name=_.name, schema=_.schema, source=backend) diff --git a/ibis/backends/tests/test_api.py b/ibis/backends/tests/test_api.py index 025b2c95eb91b..108a194ecb0c5 100644 --- a/ibis/backends/tests/test_api.py +++ b/ibis/backends/tests/test_api.py @@ -26,7 +26,6 @@ def test_version(backend): "sqlite", "datafusion", "exasol", - "pandas", "druid", "oracle", "bigquery", diff --git a/ibis/backends/tests/test_array.py b/ibis/backends/tests/test_array.py index 45f89862db94f..190d89a6c803b 100644 --- a/ibis/backends/tests/test_array.py +++ b/ibis/backends/tests/test_array.py @@ -351,11 +351,6 @@ def test_unnest_no_nulls(backend): @builtin_array -@pytest.mark.notimpl( - "pandas", - raises=ValueError, - reason="all the input arrays must have same number of dimensions", -) def test_unnest_default_name(backend): array_types = backend.array_types df = array_types.execute() @@ -423,11 +418,6 @@ def test_array_slice(backend, start, stop): raises=PsycoPg2InternalError, reason="TODO(Kexiang): seems a bug", ) -@pytest.mark.notimpl( - ["pandas"], - raises=com.OperationNotDefinedError, - reason="Operation 'ArrayMap' is not implemented for this backend", -) @pytest.mark.notimpl( ["sqlite"], raises=com.UnsupportedBackendType, reason="Unsupported type: Array: ..." ) @@ -473,12 +463,7 @@ def test_array_map(con, input, output, func): @builtin_array @pytest.mark.notimpl( - ["datafusion", "flink", "pandas", "polars"], raises=com.OperationNotDefinedError -) -@pytest.mark.notimpl( - ["pandas"], - raises=com.OperationNotDefinedError, - reason="Operation 'ArrayMap' is not implemented for this backend", + ["datafusion", "flink", "polars"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl( ["sqlite"], raises=com.UnsupportedBackendType, reason="Unsupported type: Array..." @@ -780,7 +765,7 @@ def test_array_union(con, a, b, expected_array): @builtin_array -@pytest.mark.notimpl(["pandas", "polars", "flink"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["polars", "flink"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl( ["sqlite"], raises=com.UnsupportedBackendType, reason="Unsupported type: Array..." 
) @@ -868,16 +853,7 @@ def test_unnest_struct_with_multiple_fields(con): array_zip_notimpl = pytest.mark.notimpl( - [ - "datafusion", - "druid", - "oracle", - "pandas", - "polars", - "postgres", - "risingwave", - "flink", - ], + ["datafusion", "druid", "oracle", "polars", "postgres", "risingwave", "flink"], raises=com.OperationNotDefinedError, ) @@ -1141,7 +1117,7 @@ def test_unnest_empty_array(con): @builtin_array @pytest.mark.notimpl( - ["datafusion", "flink", "polars", "pandas"], raises=com.OperationNotDefinedError + ["datafusion", "flink", "polars"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl(["sqlite"], raises=com.UnsupportedBackendType) @pytest.mark.notyet( @@ -1161,7 +1137,7 @@ def test_array_map_with_conflicting_names(backend, con): @builtin_array @pytest.mark.notimpl( - ["datafusion", "flink", "polars", "sqlite", "pandas", "sqlite"], + ["datafusion", "flink", "polars", "sqlite", "sqlite"], raises=com.OperationNotDefinedError, ) def test_complex_array_map(con): @@ -1350,9 +1326,6 @@ def test_repr_timestamp_array(con, monkeypatch): ["datafusion", "flink", "polars"], raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl( - ["pandas"], raises=ValueError, reason="reindex on duplicate values" -) def test_unnest_range(con): expr = ibis.range(2).unnest().name("x").as_table().mutate({"y": 1.0}) result = con.execute(expr) @@ -1384,7 +1357,7 @@ def test_array_literal_with_exprs(con, input, expected): @pytest.mark.notimpl( - ["datafusion", "postgres", "pandas", "polars", "risingwave", "flink"], + ["datafusion", "postgres", "polars", "risingwave", "flink"], raises=com.OperationNotDefinedError, ) @pytest.mark.notimpl( @@ -1404,7 +1377,7 @@ def test_zip_unnest_lift(con): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError + ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.parametrize( "colspec", @@ -1419,7 +1392,7 @@ def test_table_unnest(backend, colspec): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError + ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError ) def test_table_unnest_with_offset(backend): t = backend.array_types @@ -1444,7 +1417,7 @@ def test_table_unnest_with_offset(backend): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError + ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError ) def test_table_unnest_with_keep_empty(con): t = ibis.memtable(pd.DataFrame({"y": [[], None, ["a"]]})) @@ -1454,7 +1427,7 @@ def test_table_unnest_with_keep_empty(con): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError + ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.notyet( ["risingwave"], raises=PsycoPg2InternalError, reason="not supported in risingwave" @@ -1468,7 +1441,7 @@ def test_table_unnest_column_expr(backend): @pytest.mark.notimpl( - ["datafusion", "pandas", "polars", "flink"], raises=com.OperationNotDefinedError + ["datafusion", "polars", "flink"], raises=com.OperationNotDefinedError ) @pytest.mark.notimpl(["trino"], raises=TrinoUserError) @pytest.mark.notimpl(["postgres"], raises=PsycoPg2SyntaxError) @@ -1496,7 +1469,7 @@ def test_table_unnest_array_of_struct_of_array(con): notimpl_aggs = pytest.mark.notimpl( - ["datafusion", "flink", "pandas"], raises=com.OperationNotDefinedError + ["datafusion", "flink"], raises=com.OperationNotDefinedError 
) diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py index 4f84c965d895c..27575c67dcc7c 100644 --- a/ibis/backends/tests/test_client.py +++ b/ibis/backends/tests/test_client.py @@ -591,6 +591,7 @@ def _emp(a, b, c, d): reason="`insert` method not implemented", ) @pytest.mark.notyet(["druid"], raises=NotImplementedError) +@pytest.mark.notimpl(["polars"], raises=AttributeError) @pytest.mark.notimpl( ["flink"], raises=com.IbisError, diff --git a/ibis/backends/tests/test_column.py b/ibis/backends/tests/test_column.py index cb5231ce696cf..c60b6d4706540 100644 --- a/ibis/backends/tests/test_column.py +++ b/ibis/backends/tests/test_column.py @@ -15,7 +15,6 @@ "mssql", "mysql", "oracle", - "pandas", "polars", "postgres", "risingwave", diff --git a/ibis/backends/tests/test_dot_sql.py b/ibis/backends/tests/test_dot_sql.py index 232cd6bbd1f92..ad015cd329e53 100644 --- a/ibis/backends/tests/test_dot_sql.py +++ b/ibis/backends/tests/test_dot_sql.py @@ -22,8 +22,6 @@ pd = pytest.importorskip("pandas") tm = pytest.importorskip("pandas.testing") -dot_sql_never = pytest.mark.never(["pandas"], reason="pandas does not accept SQL") - _NAMES = { "bigquery": f"ibis_gbq_testing_{getpass.getuser()}_{PYTHON_SHORT_VERSION}.functional_alltypes", } @@ -41,7 +39,6 @@ def ftname(con, ftname_raw): return table.sql(con.dialect) -@dot_sql_never @pytest.mark.parametrize( "schema", [ @@ -90,7 +87,6 @@ def test_con_dot_sql(backend, con, schema, ftname): @pytest.mark.notyet( ["druid"], raises=com.IbisTypeError, reason="druid does not preserve case" ) -@dot_sql_never def test_table_dot_sql(backend): alltypes = backend.functional_alltypes t = ( @@ -125,7 +121,6 @@ def test_table_dot_sql(backend): assert pytest.approx(result) == expected -@dot_sql_never @pytest.mark.notyet( ["bigquery"], raises=GoogleBadRequest, reason="requires a qualified name" ) @@ -186,7 +181,6 @@ def test_table_dot_sql_with_join(backend): @pytest.mark.notyet( ["bigquery"], raises=GoogleBadRequest, reason="requires a qualified name" ) -@dot_sql_never def test_table_dot_sql_repr(backend): alltypes = backend.functional_alltypes t = ( @@ -211,7 +205,6 @@ def test_table_dot_sql_repr(backend): assert repr(t) -@dot_sql_never def test_dot_sql_alias_with_params(backend, alltypes, df): t = alltypes x = t.select(x=t.string_col + " abc").alias("foo") @@ -220,7 +213,6 @@ def test_dot_sql_alias_with_params(backend, alltypes, df): backend.assert_series_equal(result.x, expected) -@dot_sql_never def test_dot_sql_reuse_alias_with_different_types(backend, alltypes, df): foo1 = alltypes.select(x=alltypes.string_col).alias("foo") foo2 = alltypes.select(x=alltypes.bigint_col).alias("foo") @@ -230,15 +222,10 @@ def test_dot_sql_reuse_alias_with_different_types(backend, alltypes, df): backend.assert_series_equal(foo2.x.execute(), expected2) -_NO_SQLGLOT_DIALECT = ("pandas",) -no_sqlglot_dialect = [ - param(dialect, marks=pytest.mark.xfail) for dialect in sorted(_NO_SQLGLOT_DIALECT) -] -dialects = sorted(_get_backend_names(exclude=_NO_SQLGLOT_DIALECT)) + no_sqlglot_dialect +dialects = sorted(_get_backend_names()) @pytest.mark.parametrize("dialect", dialects) -@dot_sql_never @pytest.mark.notyet(["druid"], reason="druid doesn't respect column name case") def test_table_dot_sql_transpile(backend, alltypes, dialect, df): name = "foo2" @@ -256,7 +243,6 @@ def test_table_dot_sql_transpile(backend, alltypes, dialect, df): ["druid"], raises=AttributeError, reason="druid doesn't respect column names" ) @pytest.mark.notyet(["bigquery"]) -@dot_sql_never def 
test_con_dot_sql_transpile(backend, con, dialect, df): t = sg.table("functional_alltypes", quoted=True) foo = sg.select( @@ -269,7 +255,6 @@ def test_con_dot_sql_transpile(backend, con, dialect, df): backend.assert_series_equal(result.x, expected) -@dot_sql_never @pytest.mark.notimpl(["druid", "polars"]) def test_order_by_no_projection(backend): con = backend.connection @@ -283,7 +268,6 @@ def test_order_by_no_projection(backend): assert set(result) == {"Ross, Jerry L.", "Chang-Diaz, Franklin R."} -@dot_sql_never def test_dot_sql_limit(con): expr = con.sql('SELECT * FROM (SELECT \'abc\' "ts") "x"', dialect="duckdb").limit(1) result = expr.execute() @@ -294,7 +278,23 @@ def test_dot_sql_limit(con): assert result.iat[0, 0] == "abc" -@dot_sql_never +@pytest.fixture(scope="module") +def mem_t(con): + if con.name == "druid": + pytest.xfail("druid does not support create_table") + + name = ibis.util.gen_name(con.name) + + # flink only supports memtables if `temp` is True, seems like we should + # address that for users + con.create_table( + name, ibis.memtable({"a": list("def")}), temp=con.name == "flink" or None + ) + yield name + with contextlib.suppress(NotImplementedError): + con.drop_table(name, force=True) + + @pytest.mark.notyet( ["druid"], raises=KeyError, @@ -318,7 +318,6 @@ def test_cte(alltypes, df): tm.assert_frame_equal(result, expected) -@dot_sql_never def test_bare_minimum(alltypes, df, ftname_raw): """Test that a backend that supports dot sql can do the most basic thing.""" @@ -326,7 +325,6 @@ def test_bare_minimum(alltypes, df, ftname_raw): assert expr.to_pandas().iat[0, 0] == len(df) -@dot_sql_never def test_embedded_cte(alltypes, ftname_raw): sql = f'WITH "x" AS (SELECT * FROM "{ftname_raw}") SELECT * FROM "x"' expr = alltypes.sql(sql, dialect="duckdb") diff --git a/ibis/backends/tests/test_export.py b/ibis/backends/tests/test_export.py index 219935abf5e98..112bcbf3f128c 100644 --- a/ibis/backends/tests/test_export.py +++ b/ibis/backends/tests/test_export.py @@ -27,10 +27,7 @@ pd = pytest.importorskip("pandas") pa = pytest.importorskip("pyarrow") -limit = [ - # limit not implemented for pandas-family backends - param(42, id="limit", marks=pytest.mark.notimpl(["pandas"])), -] +limit = [param(42, id="limit")] no_limit = [param(None, id="nolimit")] @@ -138,7 +135,7 @@ def test_column_to_pyarrow_table_schema(awards_players): assert array.type == pa.string() or array.type == pa.large_string() -@pytest.mark.notimpl(["pandas", "datafusion", "flink"]) +@pytest.mark.notimpl(["datafusion", "flink"]) @pytest.mark.notyet( ["clickhouse"], raises=AssertionError, @@ -153,7 +150,7 @@ def test_table_pyarrow_batch_chunk_size(awards_players): util.consume(batch_reader) -@pytest.mark.notimpl(["pandas", "datafusion", "flink"]) +@pytest.mark.notimpl(["datafusion", "flink"]) @pytest.mark.notyet( ["clickhouse"], raises=AssertionError, @@ -170,7 +167,6 @@ def test_column_pyarrow_batch_chunk_size(awards_players): util.consume(batch_reader) -@pytest.mark.notimpl(["pandas"]) @pytest.mark.notimpl( ["sqlite"], raises=pa.ArrowException, @@ -270,7 +266,6 @@ def test_table_to_parquet_writer_kwargs(version, tmp_path, backend, awards_playe "mssql", "mysql", "oracle", - "pandas", "polars", "postgres", "risingwave", @@ -423,7 +418,7 @@ def test_to_pyarrow_decimal(backend, dtype, pyarrow_dtype): reason="read_delta not yet implemented", ) @pytest.mark.notyet(["clickhouse"], raises=Exception) -@pytest.mark.notyet(["mssql", "pandas"], raises=PyDeltaTableError) +@pytest.mark.notyet(["mssql"], raises=PyDeltaTableError) 
def test_roundtrip_delta(backend, con, alltypes, tmp_path, monkeypatch): if con.name == "pyspark": pytest.importorskip("delta") diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 1fb5c7b43debf..6b9df11588195 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -16,7 +16,6 @@ import ibis.expr.datatypes as dt import ibis.selectors as s from ibis import _ -from ibis.backends.conftest import is_newer_than, is_older_than from ibis.backends.tests.errors import ( ClickHouseDatabaseError, ExaQueryError, @@ -355,7 +354,7 @@ def test_filter(backend, alltypes, sorted_df, predicate_fn, expected_fn): raises=PyDruidProgrammingError, reason="requires enabling window functions", ) -@pytest.mark.notimpl(["polars", "pandas"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notyet( ["oracle"], raises=OracleDatabaseError, @@ -570,7 +569,7 @@ def test_order_by(backend, alltypes, df, key, df_kwargs): backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl(["pandas", "polars", "mssql", "druid"]) +@pytest.mark.notimpl(["polars", "mssql", "druid"]) @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -707,19 +706,12 @@ def test_order_by_two_cols_nulls(con, op1, nf1, nf2, op2, expected): getattr(t["col2"], op2)(nulls_first=nf2), ) - if con.name == "pandas" and nf1 != nf2: - with pytest.raises( - ValueError, - match=f"{con.name} does not support specifying null ordering for individual column", - ): - con.execute(expr) - else: - result = con.execute(expr).reset_index(drop=True) - expected = pd.DataFrame(expected) + result = con.execute(expr).reset_index(drop=True) + expected = pd.DataFrame(expected) - tm.assert_frame_equal( - result.replace({np.nan: None}), expected.replace({np.nan: None}) - ) + tm.assert_frame_equal( + result.replace({np.nan: None}), expected.replace({np.nan: None}) + ) @pytest.mark.notyet( @@ -819,11 +811,6 @@ def test_table_info_large(con): raises=(OracleDatabaseError, com.OperationNotDefinedError), reason="Mode is not supported and ORA-02000: missing AS keyword", ), - pytest.mark.notimpl( - ["pandas"], - condition=is_newer_than("pandas", "2.1.0"), - reason="FutureWarning: concat empty or all-NA entries is deprecated", - ), pytest.mark.notyet( ["polars"], raises=PolarsSchemaError, @@ -1090,7 +1077,7 @@ def test_int_scalar(alltypes): assert result.dtype == np.int32 -@pytest.mark.notimpl(["datafusion", "pandas", "polars", "druid"]) +@pytest.mark.notimpl(["datafusion", "polars", "druid"]) @pytest.mark.notyet( ["clickhouse"], reason="https://github.com/ClickHouse/ClickHouse/issues/6697" ) @@ -1107,17 +1094,7 @@ def test_exists(batting, awards_players, method_name): @pytest.mark.notimpl( - [ - "datafusion", - "mssql", - "mysql", - "pandas", - "pyspark", - "polars", - "druid", - "oracle", - "exasol", - ], + ["datafusion", "mssql", "mysql", "pyspark", "polars", "druid", "oracle", "exasol"], raises=com.OperationNotDefinedError, ) def test_typeof(con): @@ -1131,7 +1108,7 @@ def test_typeof(con): @pytest.mark.notimpl(["polars"], reason="incorrect answer") @pytest.mark.notyet(["impala"], reason="can't find table in subquery") @pytest.mark.notimpl(["datafusion", "druid"]) -@pytest.mark.notimpl(["pyspark"], condition=is_older_than("pyspark", "3.5.0")) +@pytest.mark.xfail_version(pyspark=["pyspark<3.5"]) @pytest.mark.notyet(["exasol"], raises=ExaQueryError, reason="not supported by exasol") @pytest.mark.notyet( ["risingwave"], @@ 
-1546,7 +1523,7 @@ def test_distinct_on_keep_is_none(backend, on): assert len(result) == len(expected) -@pytest.mark.notimpl(["pandas", "risingwave", "flink", "exasol"]) +@pytest.mark.notimpl(["risingwave", "flink", "exasol"]) @pytest.mark.notyet( [ "sqlite", @@ -1611,7 +1588,6 @@ def test_hash(backend, alltypes, dtype): "flink", "impala", "mysql", - "pandas", "polars", "postgres", "pyspark", @@ -1642,7 +1618,6 @@ def hash_256(col): "impala", "mysql", "oracle", - "pandas", "polars", "postgres", "risingwave", @@ -1684,7 +1659,6 @@ def hash_256(col): [0, 1, 2], ["0", "1", "2"], marks=[ - pytest.mark.notimpl(["pandas"], reason="casts to ['0']"), pytest.mark.notimpl(["druid"], raises=PyDruidProgrammingError), pytest.mark.notimpl(["oracle"], raises=OracleDatabaseError), pytest.mark.notyet(["bigquery"], raises=GoogleBadRequest), @@ -1723,7 +1697,7 @@ def test_cast(con, from_type, to_type, from_val, expected): assert result == expected -@pytest.mark.notimpl(["pandas", "oracle", "sqlite"]) +@pytest.mark.notimpl(["oracle", "sqlite"]) @pytest.mark.parametrize( ("from_val", "to_type", "expected"), [ @@ -1770,7 +1744,6 @@ def test_try_cast(con, from_val, to_type, expected): "exasol", "mysql", "oracle", - "pandas", "postgres", "risingwave", "sqlite", @@ -1803,7 +1776,6 @@ def test_try_cast_null(con, from_val, to_type): @pytest.mark.notimpl( [ - "pandas", "datafusion", "druid", "mysql", @@ -1829,7 +1801,6 @@ def test_try_cast_table(backend, con): @pytest.mark.notimpl( [ - "pandas", "datafusion", "mysql", "oracle", @@ -2355,7 +2326,6 @@ def test_select_sort_sort_deferred(backend, alltypes, df): backend.assert_frame_equal(result, expected) -@pytest.mark.notimpl(["pandas"], raises=IndexError, reason="NaN isn't treated as NULL") @pytest.mark.notimpl( ["druid"], raises=AttributeError, @@ -2374,11 +2344,6 @@ def test_topk_counts_null(con): raises=AssertionError, reason="ClickHouse returns False for x.isin([None])", ) -@pytest.mark.notimpl( - ["pandas"], - raises=AssertionError, - reason="null isin semantics are not implemented for pandas", -) @pytest.mark.never( "mssql", raises=AssertionError, diff --git a/ibis/backends/tests/test_interactive.py b/ibis/backends/tests/test_interactive.py index ab4503a4aafa3..6acda4055866e 100644 --- a/ibis/backends/tests/test_interactive.py +++ b/ibis/backends/tests/test_interactive.py @@ -33,7 +33,7 @@ def table(backend): return backend.functional_alltypes -@pytest.mark.notimpl(["pandas", "polars"]) +@pytest.mark.notimpl(["polars"]) def test_interactive_execute_on_repr(table, queries): repr(table.bigint_col.sum()) assert len(queries) >= 1 @@ -54,21 +54,21 @@ def test_repr_png_is_not_none_in_not_interactive(table): assert table._repr_png_() is not None -@pytest.mark.notimpl(["pandas", "polars"]) +@pytest.mark.notimpl(["polars"]) def test_default_limit(table, queries): repr(table.select("id", "bool_col")) assert len(queries) >= 1 -@pytest.mark.notimpl(["pandas", "polars"]) +@pytest.mark.notimpl(["polars"]) def test_respect_set_limit(table, queries): repr(table.select("id", "bool_col").limit(10)) assert len(queries) >= 1 -@pytest.mark.notimpl(["pandas", "polars"]) +@pytest.mark.notimpl(["polars"]) def test_disable_query_limit(table, queries): assert ibis.options.sql.default_limit is None diff --git a/ibis/backends/tests/test_json.py b/ibis/backends/tests/test_json.py index 42af343067795..ae2374d8dcd1b 100644 --- a/ibis/backends/tests/test_json.py +++ b/ibis/backends/tests/test_json.py @@ -63,7 +63,7 @@ def test_json_getitem_array(json_t): assert result == expected 
-@pytest.mark.notimpl(["mysql", "pandas", "risingwave"]) +@pytest.mark.notimpl(["mysql", "risingwave"]) @pytest.mark.notyet(["bigquery", "sqlite"], reason="doesn't support maps") @pytest.mark.notyet(["postgres"], reason="only supports map") @pytest.mark.notyet( @@ -85,7 +85,7 @@ def test_json_map(backend, json_t): backend.assert_series_equal(result, expected) -@pytest.mark.notimpl(["mysql", "pandas", "risingwave"]) +@pytest.mark.notimpl(["mysql", "risingwave"]) @pytest.mark.notyet(["sqlite"], reason="doesn't support arrays") @pytest.mark.notyet( ["pyspark", "flink"], reason="should work but doesn't deserialize JSON" @@ -107,7 +107,7 @@ def test_json_array(backend, json_t): condition=vparse(sqlite3.sqlite_version) < vparse("3.38.0"), reason="JSON not supported in SQLite < 3.38.0", ) -@pytest.mark.notimpl(["pandas", "risingwave"]) +@pytest.mark.notimpl(["risingwave"]) @pytest.mark.notyet(["flink"], reason="should work but doesn't deserialize JSON") @pytest.mark.parametrize( ("typ", "expected_data"), diff --git a/ibis/backends/tests/test_map.py b/ibis/backends/tests/test_map.py index 2a4f6f0cea186..851c0810732a0 100644 --- a/ibis/backends/tests/test_map.py +++ b/ibis/backends/tests/test_map.py @@ -41,7 +41,6 @@ @pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") -@pytest.mark.notimpl(["pandas"], reason="TypeError: iteration over a 0-d array") @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -63,7 +62,6 @@ def test_map_nulls(con, k, v): @pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") -@pytest.mark.notimpl(["pandas"], reason="TypeError: iteration over a 0-d array") @pytest.mark.notimpl( ["risingwave"], raises=PsycoPg2InternalError, @@ -96,11 +94,6 @@ def test_map_keys_nulls(con, k, v): ibis.map( ibis.literal(["a", "b"]), ibis.literal(None, type="array") ), - marks=[ - pytest.mark.notimpl( - ["pandas"], reason="TypeError: iteration over a 0-d array" - ) - ], id="null_values", ), param( @@ -108,11 +101,6 @@ def test_map_keys_nulls(con, k, v): ibis.literal(None, type="array"), ibis.literal(None, type="array"), ), - marks=[ - pytest.mark.notimpl( - ["pandas"], reason="TypeError: iteration over a 0-d array" - ) - ], id="null_both", ), param(ibis.literal(None, type="map"), id="null_map"), @@ -136,11 +124,6 @@ def test_map_values_nulls(con, map): ), ibis.literal(None, type="string"), marks=[ - pytest.mark.notimpl( - ["pandas"], - reason="result is False instead of None", - strict=False, # passes for contains, but not for get - ), pytest.mark.notimpl( "flink", raises=AssertionError, @@ -157,10 +140,7 @@ def test_map_values_nulls(con, map): ), "a", marks=[ - pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"), - pytest.mark.notimpl( - ["pandas"], reason="TypeError: iteration over a 0-d array" - ), + pytest.mark.notyet("clickhouse", reason="nested types can't be NULL") ], id="null_both_non_null_key", ), @@ -172,9 +152,6 @@ def test_map_values_nulls(con, map): ibis.literal(None, type="string"), marks=[ pytest.mark.notyet("clickhouse", reason="nested types can't be NULL"), - pytest.mark.notimpl( - ["pandas"], reason="TypeError: iteration over a 0-d array" - ), ], id="null_both_null_key", ), @@ -233,14 +210,12 @@ def test_map_merge_nulls(con, m1, m2): assert con.execute(concatted) is None -@pytest.mark.notimpl(["pandas"]) def test_map_table(backend): table = backend.map assert table.kv.type().is_map() assert not table.limit(1).execute().empty -@pytest.mark.notimpl(["pandas"]) @mark_notimpl_risingwave_hstore def 
test_column_map_values(backend): table = backend.map @@ -250,7 +225,6 @@ def test_column_map_values(backend): backend.assert_series_equal(result, expected) -@pytest.mark.notimpl(["pandas"]) def test_column_map_merge(backend): table = backend.map expr = table.select( @@ -402,7 +376,6 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notimpl(["pandas"], reason="DateFromYMD isn't implemented"), mark_notyet_postgres, mark_notyet_snowflake, ], @@ -414,7 +387,6 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notyet(["pandas"]), mark_notyet_postgres, mark_notyet_snowflake, ], @@ -426,7 +398,6 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): pytest.mark.notyet( "clickhouse", reason="only supports str,int,bool,timestamp keys" ), - pytest.mark.notyet(["pandas"]), mark_notyet_postgres, pytest.mark.notyet( ["flink"], @@ -475,10 +446,7 @@ def test_literal_map_getitem_broadcast(backend, alltypes, df): ), pytest.param( [ibis.date(2021, 1, 1), ibis.date(2022, 2, 2)], - marks=[ - pytest.mark.notimpl(["pandas"], reason="DateFromYMD isn't implemented"), - mark_notyet_postgres, - ], + marks=[mark_notyet_postgres], id="date", ), pytest.param( diff --git a/ibis/backends/tests/test_network.py b/ibis/backends/tests/test_network.py index 33e1b2c997eb9..8cc08b7dcb08c 100644 --- a/ibis/backends/tests/test_network.py +++ b/ibis/backends/tests/test_network.py @@ -52,7 +52,6 @@ def test_macaddr_literal(con, backend): "impala": "127.0.0.1", "postgres": "127.0.0.1", "risingwave": "127.0.0.1", - "pandas": "127.0.0.1", "pyspark": "127.0.0.1", "mysql": "127.0.0.1", "mssql": "127.0.0.1", @@ -85,7 +84,6 @@ def test_macaddr_literal(con, backend): "impala": "2001:db8::1", "postgres": "2001:db8::1", "risingwave": "2001:db8::1", - "pandas": "2001:db8::1", "pyspark": "2001:db8::1", "mysql": "2001:db8::1", "mssql": "2001:db8::1", diff --git a/ibis/backends/tests/test_numeric.py b/ibis/backends/tests/test_numeric.py index 0e5579e72ce99..d5c16e9d08ac1 100644 --- a/ibis/backends/tests/test_numeric.py +++ b/ibis/backends/tests/test_numeric.py @@ -251,7 +251,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "impala": decimal.Decimal("1"), "postgres": decimal.Decimal("1.1"), "risingwave": decimal.Decimal("1.1"), - "pandas": decimal.Decimal("1.1"), "pyspark": decimal.Decimal("1.1"), "mysql": decimal.Decimal("1"), "mssql": decimal.Decimal("1"), @@ -294,7 +293,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "impala": decimal.Decimal("1.1"), "postgres": decimal.Decimal("1.1"), "risingwave": decimal.Decimal("1.1"), - "pandas": decimal.Decimal("1.1"), "pyspark": decimal.Decimal("1.1"), "mysql": decimal.Decimal("1.1"), "clickhouse": decimal.Decimal("1.1"), @@ -328,7 +326,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "sqlite": decimal.Decimal("1.1"), "postgres": decimal.Decimal("1.1"), "risingwave": decimal.Decimal("1.1"), - "pandas": decimal.Decimal("1.1"), "pyspark": decimal.Decimal("1.1"), "clickhouse": decimal.Decimal( "1.10000000000000003193790845333396190208" @@ -383,7 +380,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "sqlite": decimal.Decimal("Infinity"), "postgres": decimal.Decimal("Infinity"), "risingwave": decimal.Decimal("Infinity"), - "pandas": decimal.Decimal("Infinity"), "pyspark": 
decimal.Decimal("Infinity"), "exasol": float("inf"), "duckdb": float("inf"), @@ -447,7 +443,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "sqlite": decimal.Decimal("-Infinity"), "postgres": decimal.Decimal("-Infinity"), "risingwave": decimal.Decimal("-Infinity"), - "pandas": decimal.Decimal("-Infinity"), "pyspark": decimal.Decimal("-Infinity"), "exasol": float("-inf"), "duckdb": float("-inf"), @@ -512,7 +507,6 @@ def test_numeric_literal(con, backend, expr, expected_types): "sqlite": None, "postgres": float("nan"), "risingwave": float("nan"), - "pandas": decimal.Decimal("NaN"), "pyspark": decimal.Decimal("NaN"), "exasol": float("nan"), "duckdb": float("nan"), @@ -1301,7 +1295,7 @@ def test_divide_by_zero(backend, alltypes, df, column, denominator): backend.assert_series_equal(result.astype("float64"), expected) -@pytest.mark.notimpl(["pandas", "polars"], raises=com.OperationNotDefinedError) +@pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError) @pytest.mark.notimpl(["druid"], raises=PyDruidProgrammingError) @pytest.mark.notimpl( ["risingwave"], diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py index 05c2f9f5b9360..02df85ef1cb0a 100644 --- a/ibis/backends/tests/test_register.py +++ b/ibis/backends/tests/test_register.py @@ -89,7 +89,6 @@ def gzip_csv(data_dir, tmp_path): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -117,7 +116,6 @@ def test_register_csv(con, data_dir, fname, in_table_name, out_table_name): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -142,7 +140,6 @@ def test_register_csv_gz(con, data_dir, gzip_csv): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -200,7 +197,6 @@ def read_table(path: Path) -> Iterator[tuple[str, pa.Table]]: "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -238,7 +234,6 @@ def test_register_parquet( "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "pyspark", @@ -284,7 +279,6 @@ def test_register_iterator_parquet( "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "pyspark", @@ -319,7 +313,6 @@ def test_register_pandas(con): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "pyspark", @@ -345,7 +338,6 @@ def test_register_pyarrow_tables(con): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "snowflake", @@ -385,7 +377,6 @@ def test_csv_reregister_schema(con, tmp_path): "impala", "mysql", "mssql", - "pandas", "polars", "postgres", "risingwave", @@ -453,7 +444,6 @@ def ft_data(data_dir): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "sqlite", @@ -482,7 +472,6 @@ def test_read_parquet_glob(con, tmp_path, ft_data): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "sqlite", @@ -512,7 +501,6 @@ def test_read_csv_glob(con, tmp_path, ft_data): "impala", "mssql", "mysql", - "pandas", "postgres", "risingwave", "sqlite", diff --git a/ibis/backends/tests/test_set_ops.py b/ibis/backends/tests/test_set_ops.py index b6451f017b872..60391f983525c 100644 --- a/ibis/backends/tests/test_set_ops.py +++ b/ibis/backends/tests/test_set_ops.py @@ -70,15 +70,7 @@ def test_union_mixed_distinct(backend, union_subsets): False, marks=[ pytest.mark.notyet( - [ - "impala", - "bigquery", - "pandas", - "sqlite", - "snowflake", - "mssql", - "exasol", - ], + ["impala", "bigquery", "sqlite", "snowflake", "mssql", "exasol"], reason="backend doesn't support INTERSECT ALL", ), 
            pytest.mark.notimpl(
@@ -123,15 +115,7 @@ def test_intersect(backend, alltypes, df, distinct):
        False,
        marks=[
            pytest.mark.notyet(
-               [
-                   "impala",
-                   "bigquery",
-                   "pandas",
-                   "sqlite",
-                   "snowflake",
-                   "mssql",
-                   "exasol",
-               ],
+               ["impala", "bigquery", "sqlite", "snowflake", "mssql", "exasol"],
                reason="backend doesn't support EXCEPT ALL",
            ),
            pytest.mark.notimpl(
@@ -223,15 +207,7 @@ def test_top_level_union(backend, con, alltypes, distinct, ordered):
        False,
        marks=[
            pytest.mark.notimpl(
-               [
-                   "impala",
-                   "bigquery",
-                   "mssql",
-                   "pandas",
-                   "snowflake",
-                   "sqlite",
-                   "exasol",
-               ]
+               ["impala", "bigquery", "mssql", "snowflake", "sqlite", "exasol"]
            ),
            pytest.mark.notimpl(
                ["risingwave"],
diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py
index c09b0412d0b1d..fe30396c762ee 100644
--- a/ibis/backends/tests/test_sql.py
+++ b/ibis/backends/tests/test_sql.py
@@ -44,12 +44,12 @@
        ),
    ],
)
-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
def test_literal(backend, expr):
    assert "432" in ibis.to_sql(expr, dialect=backend.name())

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
def test_group_by_has_index(backend, snapshot):
    countries = ibis.table(
        dict(continent="string", population="int64"), name="countries"
@@ -72,7 +72,7 @@ def test_group_by_has_index(backend, snapshot):
    snapshot.assert_match(sql, "out.sql")

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
def test_cte_refs_in_topo_order(backend, snapshot):
    mr0 = ibis.table(schema=ibis.schema(dict(key="int")), name="leaf")
@@ -85,7 +85,7 @@ def test_cte_refs_in_topo_order(backend, snapshot):
    snapshot.assert_match(sql, "out.sql")

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
def test_isin_bug(con, snapshot):
    t = ibis.table(dict(x="int"), name="t")
    good = t.filter(t.x > 2).x
@@ -93,7 +93,7 @@ def test_isin_bug(con, snapshot):
    snapshot.assert_match(str(ibis.to_sql(expr, dialect=con.name)), "out.sql")

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
@pytest.mark.notyet(
    ["exasol", "oracle", "flink"],
    reason="no unnest support",
@@ -158,7 +158,7 @@ def test_union_aliasing(backend_name, snapshot):
    snapshot.assert_match(str(ibis.to_sql(result, dialect=backend_name)), "out.sql")

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=ValueError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=ValueError)
@pytest.mark.parametrize(
    "value",
    [
@@ -182,7 +182,7 @@ def test_selects_with_impure_operations_not_merged(con, snapshot, value):
    snapshot.assert_match(sql, "out.sql")

-@pytest.mark.never(["pandas", "polars"], reason="not SQL", raises=NotImplementedError)
+@pytest.mark.never(["polars"], reason="not SQL", raises=NotImplementedError)
def test_to_sql_default_backend(con, snapshot, monkeypatch):
    monkeypatch.setattr(ibis.options, "default_backend", con)
@@ -191,9 +191,7 @@ def test_to_sql_default_backend(con, snapshot, monkeypatch):
    snapshot.assert_match(ibis.to_sql(expr), "to_sql.sql")

-@pytest.mark.notimpl(
-    ["pandas", "polars"], raises=ValueError, reason="not a SQL backend"
-)
+@pytest.mark.notimpl(["polars"], raises=ValueError, reason="not a SQL backend")
def test_many_subqueries(backend_name, snapshot):
    def query(t, group_cols):
        t2 = t.mutate(key=ibis.row_number().over(ibis.window(order_by=group_cols)))
@@ -208,9 +206,7 @@ def query(t, group_cols):

@pytest.mark.parametrize("backend_name", _get_backends_to_test())
-@pytest.mark.notimpl(
-    ["pandas", "polars"], raises=ValueError, reason="not a SQL backend"
-)
+@pytest.mark.notimpl(["polars"], raises=ValueError, reason="not a SQL backend")
def test_mixed_qualified_and_unqualified_predicates(backend_name, snapshot):
    t = ibis.table({"x": "int64"}, name="t")
    expr = t.mutate(y=t.x.sum().over(ibis.window())).filter(
@@ -228,9 +224,7 @@ def test_mixed_qualified_and_unqualified_predicates(backend_name, snapshot):

@pytest.mark.parametrize("backend_name", _get_backends_to_test())
-@pytest.mark.notimpl(
-    ["pandas", "polars"], raises=ValueError, reason="not a SQL backend"
-)
+@pytest.mark.notimpl(["polars"], raises=ValueError, reason="not a SQL backend")
def test_rewrite_context(snapshot, backend_name):
    table = ibis.memtable({"test": [1, 2, 3, 4, 5]}, name="test")
    expr = table.select(new_col=ibis.ntile(2).over(order_by=ibis.random())).limit(10)
diff --git a/ibis/backends/tests/test_string.py b/ibis/backends/tests/test_string.py
index ab194343acfa1..889c42206fa0c 100644
--- a/ibis/backends/tests/test_string.py
+++ b/ibis/backends/tests/test_string.py
@@ -748,7 +748,7 @@ def test_substr_with_null_values(backend, alltypes, df):
        id="file",
        marks=[
            pytest.mark.notimpl(
-               ["pandas", "datafusion", "sqlite"],
+               ["datafusion", "sqlite"],
                raises=com.OperationNotDefinedError,
            ),
        ],
@@ -824,17 +824,7 @@ def test_capitalize(con, inp, expected):

@pytest.mark.notimpl(
-   [
-       "pandas",
-       "polars",
-       "oracle",
-       "flink",
-       "sqlite",
-       "mssql",
-       "mysql",
-       "exasol",
-       "impala",
-   ],
+   ["polars", "oracle", "flink", "sqlite", "mssql", "mysql", "exasol", "impala"],
    raises=com.OperationNotDefinedError,
)
def test_array_string_join(con):
@@ -870,7 +860,6 @@ def test_multiple_subs(con):
        "impala",
        "mssql",
        "mysql",
-       "pandas",
        "polars",
        "sqlite",
        "flink",
@@ -922,7 +911,6 @@ def test_non_match_regex_search_is_false(con):
        "oracle",
        "flink",
        "exasol",
-       "pandas",
        "bigquery",
    ],
    raises=com.OperationNotDefinedError,
@@ -944,7 +932,6 @@ def test_re_split(con):
        "oracle",
        "flink",
        "exasol",
-       "pandas",
        "bigquery",
    ],
    raises=com.OperationNotDefinedError,
@@ -965,7 +952,6 @@ def test_re_split_column(alltypes):
        "oracle",
        "flink",
        "exasol",
-       "pandas",
        "bigquery",
    ],
    raises=com.OperationNotDefinedError,
@@ -1004,7 +990,6 @@ def test_re_split_column_multiple_patterns(alltypes):
    [lambda n: n + "a", lambda n: n + n, lambda n: "a" + n],
    ids=["null-a", "null-null", "a-null"],
)
-@pytest.mark.notimpl(["pandas"], raises=TypeError)
def test_concat_with_null(con, fn):
    null = ibis.literal(None, type="string")
    expr = fn(null)
@@ -1026,7 +1011,6 @@ def test_concat_with_null(con, fn):
    [lambda args: args[0].concat(*args[1:]), lambda args: reduce(add, args)],
    ids=["concat", "add"],
)
-@pytest.mark.notimpl(["pandas"], raises=TypeError)
def test_concat(con, args, method):
    expr = method(args)
    assert pd.isna(con.execute(expr))
diff --git a/ibis/backends/tests/test_struct.py b/ibis/backends/tests/test_struct.py
index 009784aa88a24..3098e349baca4 100644
--- a/ibis/backends/tests/test_struct.py
+++ b/ibis/backends/tests/test_struct.py
@@ -244,12 +244,6 @@ def test_keyword_fields(con, nullable):
    raises=PolarsColumnNotFoundError,
    reason="doesn't seem to support IN-style subqueries on structs",
)
-@pytest.mark.notimpl(
-    # https://github.com/pandas-dev/pandas/issues/58909
-    ["pandas"],
-    raises=TypeError,
-    reason="unhashable type: 'dict'",
-)
@pytest.mark.xfail_version(
    pyspark=["pyspark<3.5"],
    reason="requires pyspark 3.5",
diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py
index 5e94c28071f98..6bd9021cad029 100644
--- a/ibis/backends/tests/test_temporal.py
+++ b/ibis/backends/tests/test_temporal.py
@@ -16,7 +16,6 @@
import ibis.common.exceptions as com
import ibis.expr.datatypes as dt
from ibis.backends import _get_backend_names
-from ibis.backends.conftest import is_older_than
from ibis.backends.tests.errors import (
    ArrowInvalid,
    ClickHouseDatabaseError,
@@ -243,11 +242,6 @@ def test_timestamp_extract_milliseconds(backend, alltypes, df):
    raises=GoogleBadRequest,
    reason="UNIX_SECONDS does not support DATETIME arguments",
)
-@pytest.mark.notimpl(
-    ["pandas"],
-    raises=AssertionError,
-    condition=is_older_than("pandas", "2.0.0"),
-)
def test_timestamp_extract_epoch_seconds(backend, alltypes, df):
    expr = alltypes.timestamp_col.epoch_seconds().name("tmp")
    result = expr.execute()
@@ -915,25 +909,9 @@ def test_timestamp_comparison_filter(backend, con, alltypes, df, func_name):
    backend.assert_frame_equal(result, expected)

-no_mixed_timestamp_comparisons = [
-    pytest.mark.notimpl(
-        ["pandas"],
-        raises=TypeError,
-        reason="Invalid comparison between dtype=datetime64[ns, UTC] and datetime",
-    ),
-]
-
-
@pytest.mark.parametrize(
    "func_name",
-    [
-        param("gt", marks=no_mixed_timestamp_comparisons),
-        param("ge", marks=no_mixed_timestamp_comparisons),
-        param("lt", marks=no_mixed_timestamp_comparisons),
-        param("le", marks=no_mixed_timestamp_comparisons),
-        "eq",
-        "ne",
-    ],
+    ["gt", "ge", "lt", "le", "eq", "ne"],
)
@pytest.mark.notimpl(["druid"], raises=PyDruidProgrammingError)
@pytest.mark.notimpl(
@@ -1183,7 +1161,7 @@ def test_integer_to_timestamp(backend, con, unit):
    ],
)
@pytest.mark.notimpl(
-    ["pandas", "clickhouse", "sqlite", "datafusion", "mssql", "druid"],
+    ["clickhouse", "sqlite", "datafusion", "mssql", "druid"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError)
@@ -1254,7 +1232,7 @@ def test_string_to_timestamp(alltypes, fmt):
    ],
)
@pytest.mark.notimpl(
-    ["pandas", "clickhouse", "sqlite", "datafusion", "mssql", "druid"],
+    ["clickhouse", "sqlite", "datafusion", "mssql", "druid"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.notimpl(["exasol"], raises=com.OperationNotDefinedError)
@@ -1419,7 +1397,7 @@ def test_date_literal(con, backend):

@pytest.mark.notimpl(
-    ["pandas", "pyspark", "mysql", "exasol", "oracle"],
+    ["pyspark", "mysql", "exasol", "oracle"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.notyet(["impala"], raises=com.OperationNotDefinedError)
@@ -1436,8 +1414,7 @@ def test_timestamp_literal(con, backend):

@pytest.mark.notimpl(
-    ["pandas", "mysql", "pyspark", "exasol"],
-    raises=com.OperationNotDefinedError,
+    ["mysql", "pyspark", "exasol"], raises=com.OperationNotDefinedError
)
@pytest.mark.notyet(["impala", "oracle"], raises=com.OperationNotDefinedError)
@pytest.mark.parametrize(
@@ -1497,7 +1474,7 @@ def test_timestamp_with_timezone_literal(con, timezone, expected):

@pytest.mark.notimpl(
-    ["pandas", "datafusion", "pyspark", "polars", "mysql", "oracle"],
+    ["datafusion", "pyspark", "polars", "mysql", "oracle"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.notyet(
@@ -1634,7 +1611,7 @@ def test_date_column_from_ymd(backend, con, alltypes, df):

@pytest.mark.notimpl(
-    ["pandas", "pyspark", "mysql", "exasol"], raises=com.OperationNotDefinedError
+    ["pyspark", "mysql", "exasol"], raises=com.OperationNotDefinedError
)
@pytest.mark.notyet(["impala", "oracle"], raises=com.OperationNotDefinedError)
def test_timestamp_column_from_ymdhms(backend, con, alltypes, df):
@@ -1947,7 +1924,7 @@ def test_delta(con, start, end, unit, expected):

@pytest.mark.notimpl(
-    ["impala", "mysql", "pandas", "pyspark", "sqlite", "trino", "druid"],
+    ["impala", "mysql", "pyspark", "sqlite", "trino", "druid"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.parametrize(
@@ -2049,17 +2026,7 @@ def test_timestamp_bucket(backend, kws, pd_freq):

@pytest.mark.notimpl(
-    [
-        "datafusion",
-        "impala",
-        "mysql",
-        "oracle",
-        "pandas",
-        "pyspark",
-        "sqlite",
-        "trino",
-        "druid",
-    ],
+    ["datafusion", "impala", "mysql", "oracle", "pyspark", "sqlite", "trino", "druid"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.notimpl(
@@ -2085,7 +2052,7 @@ def test_timestamp_bucket_offset(backend, offset_mins):
    backend.assert_series_equal(res, sol)

-_NO_SQLGLOT_DIALECT = ("pandas", "flink", "polars")
+_NO_SQLGLOT_DIALECT = ("flink", "polars")
no_sqlglot_dialect = sorted(
    param(backend, marks=pytest.mark.xfail) for backend in _NO_SQLGLOT_DIALECT
)
@@ -2154,12 +2121,6 @@ def test_time_literal_sql(dialect, snapshot, micros):
                reason="clickhouse doesn't support dates after 2149-06-06",
            ),
            pytest.mark.notyet(["datafusion"], raises=Exception),
-            pytest.mark.notyet(
-                ["pandas"],
-                condition=is_older_than("pandas", "2.0.0"),
-                raises=ValueError,
-                reason="Out of bounds nanosecond timestamp: 9999-01-02 00:00:00",
-            ),
        ],
        id="large",
    ),
@@ -2173,12 +2134,6 @@ def test_time_literal_sql(dialect, snapshot, micros):
                reason="clickhouse doesn't support dates before the UNIX epoch",
            ),
            pytest.mark.notyet(["datafusion"], raises=Exception),
-            pytest.mark.notyet(
-                ["pandas"],
-                condition=is_older_than("pandas", "2.0.0"),
-                raises=ValueError,
-                reason="Out of bounds nanosecond timestamp: 1-07-17 00:00:00",
-            ),
        ],
    ),
    param(
diff --git a/ibis/backends/tests/test_udf.py b/ibis/backends/tests/test_udf.py
index ffdc6ca2437e3..3713a4cd3058d 100644
--- a/ibis/backends/tests/test_udf.py
+++ b/ibis/backends/tests/test_udf.py
@@ -18,7 +18,6 @@
    "mssql",
    "mysql",
    "oracle",
-   "pandas",
    "trino",
    "risingwave",
]
diff --git a/ibis/backends/tests/test_uuid.py b/ibis/backends/tests/test_uuid.py
index 8768b0e137e01..85e72db454a92 100644
--- a/ibis/backends/tests/test_uuid.py
+++ b/ibis/backends/tests/test_uuid.py
@@ -42,7 +42,7 @@ def test_uuid_literal(con, backend):

@pytest.mark.notimpl(
-    ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas"],
+    ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave"],
    raises=com.OperationNotDefinedError,
)
@pytest.mark.never(
@@ -55,7 +55,7 @@ def test_uuid_function(con):

@pytest.mark.notimpl(
-    ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave", "pandas"],
+    ["druid", "exasol", "oracle", "polars", "pyspark", "risingwave"],
    raises=com.OperationNotDefinedError,
)
def test_uuid_unique_each_row(con):
diff --git a/ibis/backends/tests/test_vectorized_udf.py b/ibis/backends/tests/test_vectorized_udf.py
index b119e382f539d..9a3ffb6120f9c 100644
--- a/ibis/backends/tests/test_vectorized_udf.py
+++ b/ibis/backends/tests/test_vectorized_udf.py
@@ -6,7 +6,6 @@
import ibis
import ibis.common.exceptions as com
import ibis.expr.datatypes as dt
-from ibis.backends.conftest import is_older_than
from ibis.legacy.udf.vectorized import analytic, elementwise, reduction

np = pytest.importorskip("numpy")
@@ -55,9 +54,7 @@ def add_one_udf(s: float) -> float:
        return result_formatter(add_one(s))

    yield param(add_one_legacy, id=f"add_one_legacy_{id}")
-    yield param(
-        add_one_udf, marks=[pytest.mark.notimpl(["pandas"])], id=f"add_one_modern_{id}"
-    )
+    yield param(add_one_udf, id=f"add_one_modern_{id}")

add_one_udfs = [
@@ -329,11 +326,6 @@ def test_reduction_udf_array_return_type(udf_backend, udf_alltypes, udf_df):
    udf_backend.assert_frame_equal(result, expected)

-@pytest.mark.notyet(
-    ["pandas"],
-    condition=is_older_than("pandas", "2.0.0"),
-    reason="FutureWarning: Not prepending group keys to the result index of transform-like apply",
-)
def test_reduction_udf_on_empty_data(udf_backend, udf_alltypes):
    """Test that summarization can handle empty data."""
    # First filter down to zero rows
diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py
index cd6bd5f904cac..083254bf2e256 100644
--- a/ibis/backends/tests/test_window.py
+++ b/ibis/backends/tests/test_window.py
@@ -152,9 +152,7 @@ def calc_zscore(s):
            lambda t: pandas_ntile(t.float_col, 7),
            id="ntile",
            marks=[
-                pytest.mark.notimpl(
-                    ["pandas", "polars"], raises=com.OperationNotDefinedError
-                ),
+                pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError),
                pytest.mark.notimpl(
                    ["impala"],
                    raises=AssertionError,
@@ -194,7 +192,6 @@ def calc_zscore(s):
            ),
            id="nth",
            marks=[
-                pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError),
                pytest.mark.notyet(
                    ["impala", "mssql"], raises=com.OperationNotDefinedError
                ),
@@ -673,7 +670,6 @@ def test_simple_ungrouped_window_with_scalar_order_by(alltypes):
        True,
        id="unordered-ntile",
        marks=[
-            pytest.mark.notimpl(["pandas"], raises=com.OperationNotDefinedError),
            pytest.mark.notimpl(
                ["risingwave"],
                raises=PsycoPg2InternalError,
diff --git a/ibis/expr/operations/tests/test_generic.py b/ibis/expr/operations/tests/test_generic.py
index e0e5eb4c7a982..4149527a1f9db 100644
--- a/ibis/expr/operations/tests/test_generic.py
+++ b/ibis/expr/operations/tests/test_generic.py
@@ -93,7 +93,6 @@ def test_coerced_to_value(typehint, value, expected):
    assert pat.match(value, {}) == expected

-@pytest.mark.pandas
def test_coerced_to_interval_value():
    import pandas as pd

diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py
index d85fe969b2d28..426d7eff62add 100644
--- a/ibis/tests/benchmarks/test_benchmarks.py
+++ b/ibis/tests/benchmarks/test_benchmarks.py
@@ -160,7 +160,7 @@ def test_builtins(benchmark, expr_fn, builtin, t, base, large_expr):
    benchmark(builtin, expr)

-_backends = _get_backend_names(exclude=("pandas",))
+_backends = _get_backend_names()

_XFAIL_COMPILE_BACKENDS = ("polars",)
diff --git a/nix/ibis.nix b/nix/ibis.nix
index 4b4fec00af759..f173aac3fa01b 100644
--- a/nix/ibis.nix
+++ b/nix/ibis.nix
@@ -10,7 +10,7 @@
# well and serially it takes on the order of 7-8 minutes to execute serially
let
  extras = [ "decompiler" "visualization" ];
-  backends = [ "datafusion" "duckdb" "pandas" "polars" "sqlite" ];
+  backends = [ "datafusion" "duckdb" "polars" "sqlite" ];
in
poetry2nix.mkPoetryApplication {
  python = python3;
diff --git a/pyproject.toml b/pyproject.toml
index 698a07e74d144..9b21a7034f030 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -259,7 +259,6 @@ impala = "ibis.backends.impala"
mysql = "ibis.backends.mysql"
mssql = "ibis.backends.mssql"
oracle = "ibis.backends.oracle"
-pandas = "ibis.backends.pandas"
polars = "ibis.backends.polars"
postgres = "ibis.backends.postgres"
risingwave = "ibis.backends.risingwave"
@@ -392,7 +391,6 @@ markers = [
    "mysql: MySQL tests",
    "mssql: MS SQL Server tests",
    "oracle: Oracle tests",
-    "pandas: Pandas tests",
    "polars: Polars tests",
    "postgres: PostgreSQL tests",
    "risingwave: RisingWave tests",
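The pyproject.toml hunk above deletes the `pandas = "ibis.backends.pandas"` entry point through which Ibis discovers installable backends. The following is a minimal illustrative sketch, not part of the patch, of what that removal means at runtime; it assumes the entry points are registered under a group named `ibis.backends` (the group header itself is outside the lines shown) and requires Python 3.10+ for the `group=` keyword.

```python
# Sketch only: after this change the "pandas" entry point no longer exists,
# so backend discovery will not report it. The "ibis.backends" group name is
# an assumption inferred from the module paths listed in the hunk above.
from importlib.metadata import entry_points

names = sorted(ep.name for ep in entry_points(group="ibis.backends"))
print(names)  # "pandas" is absent; "polars", "postgres", etc. remain
assert "pandas" not in names
```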