Skip to content

Commit

Permalink
feat(bigquery, impala, mssql, oracle, postgres): compile `Table.sample` to native `TABLESAMPLE` syntax when possible
Browse files Browse the repository at this point in the history
  • Loading branch information
jcrist committed Sep 24, 2024
1 parent e2235a2 commit abb4c85
Show file tree
Hide file tree
Showing 71 changed files with 522 additions and 9 deletions.
9 changes: 9 additions & 0 deletions ibis/backends/sql/compilers/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_rank,
exclude_unsupported_window_frame_from_row_number,
lower_sample,
split_select_distinct_with_order_by,
)
from ibis.common.temporal import DateUnit, IntervalUnit, TimestampUnit, TimeUnit
Expand Down Expand Up @@ -118,6 +119,14 @@ class BigQueryCompiler(SQLGlotCompiler):

supports_qualify = True

LOWERED_OPS = {
ops.Sample: lower_sample(
supports_methods=("block",),
supports_seed=False,
physical_tables_only=True,
),
}

UNSUPPORTED_OPS = (
ops.DateDiff,
ops.ExtractAuthority,
Expand Down
1 change: 1 addition & 0 deletions ibis/backends/sql/compilers/druid.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class DruidCompiler(SQLGlotCompiler):
ops.TypeOf,
ops.Unnest,
ops.Variance,
ops.Sample,
)

SIMPLE_OPS = {
Expand Down
8 changes: 7 additions & 1 deletion ibis/backends/sql/compilers/impala.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ibis.backends.sql.compilers.base import NULL, STAR, SQLGlotCompiler
from ibis.backends.sql.datatypes import ImpalaType
from ibis.backends.sql.dialects import Impala
from ibis.backends.sql.rewrites import rewrite_empty_order_by_window
from ibis.backends.sql.rewrites import lower_sample, rewrite_empty_order_by_window


class ImpalaCompiler(SQLGlotCompiler):
Expand All @@ -23,6 +23,12 @@ class ImpalaCompiler(SQLGlotCompiler):
*SQLGlotCompiler.rewrites,
)

LOWERED_OPS = {
ops.Sample: lower_sample(
supports_methods=("block",), physical_tables_only=True
),
}

UNSUPPORTED_OPS = (
ops.ArgMax,
ops.ArgMin,
Expand Down
7 changes: 7 additions & 0 deletions ibis/backends/sql/compilers/mssql.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ibis.backends.sql.rewrites import (
exclude_unsupported_window_frame_from_ops,
exclude_unsupported_window_frame_from_row_number,
lower_sample,
p,
replace,
split_select_distinct_with_order_by,
Expand Down Expand Up @@ -73,6 +74,12 @@ class MSSQLCompiler(SQLGlotCompiler):
post_rewrites = (split_select_distinct_with_order_by,)
copy_func_args = True

LOWERED_OPS = {
ops.Sample: lower_sample(
supports_methods=("block",), physical_tables_only=True
),
}

UNSUPPORTED_OPS = (
ops.ApproxMedian,
ops.ArgMax,
Expand Down
2 changes: 2 additions & 0 deletions ibis/backends/sql/compilers/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
exclude_unsupported_window_frame_from_row_number,
lower_log2,
lower_log10,
lower_sample,
rewrite_empty_order_by_window,
)

Expand Down Expand Up @@ -46,6 +47,7 @@ class OracleCompiler(SQLGlotCompiler):
LOWERED_OPS = {
ops.Log2: lower_log2,
ops.Log10: lower_log10,
ops.Sample: lower_sample(physical_tables_only=True),
}

UNSUPPORTED_OPS = (
Expand Down
4 changes: 3 additions & 1 deletion ibis/backends/sql/compilers/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from ibis.backends.sql.compilers.base import NULL, STAR, AggGen, SQLGlotCompiler
from ibis.backends.sql.datatypes import PostgresType
from ibis.backends.sql.dialects import Postgres
from ibis.backends.sql.rewrites import split_select_distinct_with_order_by
from ibis.backends.sql.rewrites import lower_sample, split_select_distinct_with_order_by
from ibis.common.exceptions import InvalidDecoratorError
from ibis.util import gen_name

Expand Down Expand Up @@ -50,6 +50,8 @@ class PostgresCompiler(SQLGlotCompiler):
POS_INF = sge.Literal.number("'Inf'::double precision")
NEG_INF = sge.Literal.number("'-Inf'::double precision")

LOWERED_OPS = {ops.Sample: lower_sample(physical_tables_only=True)}

UNSUPPORTED_OPS = (
ops.RowID,
ops.TimeDelta,
Expand Down
1 change: 1 addition & 0 deletions ibis/backends/sql/compilers/risingwave.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class RisingWaveCompiler(PostgresCompiler):
ops.RandomUUID,
ops.MultiQuantile,
ops.ApproxMultiQuantile,
ops.Sample,
*(
op
for op in ALL_OPERATIONS
Expand Down
12 changes: 12 additions & 0 deletions ibis/backends/sql/dialects.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,18 +307,30 @@ class Tokenizer(Hive.Tokenizer):
STRING_ESCAPES = ["'"]


def tablesample_percent_to_int(self, expr):
    """Render a ``TABLESAMPLE`` clause with its percentage rounded to an integer.

    Impala's TABLESAMPLE only supports integer percentages.

    Parameters
    ----------
    self
        The SQL generator doing the rendering; it must provide
        ``tablesample_sql``.
    expr
        The table-sample expression to render. It is never mutated: when the
        percentage has to be rewritten, the rewrite happens on a copy.

    Returns
    -------
    Whatever ``self.tablesample_sql`` returns for the (possibly rewritten)
    expression.
    """
    percent = expr.args.get("percent")
    if percent is None:
        # There is no percentage to round (the sample size was expressed some
        # other way, or not at all). Defer to the default rendering instead of
        # failing on a missing key / ``None.this``.
        return self.tablesample_sql(expr)

    # Work on a copy so the caller's expression tree is left untouched.
    expr = expr.copy()
    # NOTE: ``round`` uses round-half-to-even, e.g. 0.5 -> 0 and 2.5 -> 2.
    expr.args["percent"] = sge.convert(round(float(percent.this)))
    # Call ``tablesample_sql`` directly rather than going back through the
    # generator's generic dispatch for this expression.
    return self.tablesample_sql(expr)


# Impala SQL dialect, defined as a specialization of the Hive dialect: only
# the settings listed here differ from what ``Hive`` provides.
class Impala(Hive):
NULL_ORDERING = "nulls_are_large"
REGEXP_EXTRACT_DEFAULT_GROUP = 0
# NOTE(review): presumably tells the dialect to treat a bare TABLESAMPLE size
# as a percentage -- confirm against the sqlglot dialect documentation.
TABLESAMPLE_SIZE_IS_PERCENT = True
# NOTE(review): presumably makes the table alias come before the TABLESAMPLE
# clause rather than after it -- confirm against sqlglot.
ALIAS_POST_TABLESAMPLE = False

# Generator overrides layered on top of the Hive generator.
class Generator(Hive.Generator):
# NOTE(review): presumably enables emitting the sampling method name in the
# TABLESAMPLE clause -- confirm against sqlglot.
TABLESAMPLE_WITH_METHOD = True

# Start from a copy of Hive's transforms and merge the Impala-specific
# entries on top; Hive's own mapping is not modified.
TRANSFORMS = Hive.Generator.TRANSFORMS.copy() | {
sge.ApproxDistinct: rename_func("ndv"),
sge.IsNan: rename_func("is_nan"),
sge.IsInf: rename_func("is_inf"),
sge.DayOfWeek: rename_func("dayofweek"),
sge.Interval: lambda self, e: _interval(self, e, quote_arg=False),
sge.CurrentDate: rename_func("current_date"),
# Rounds the sample percentage to an integer before rendering.
sge.TableSample: tablesample_percent_to_int,
}


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM `test` AS `t0` TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
randCanonical() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM "test" AS "t0" TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM "test" AS "t0" TABLESAMPLE bernoulli (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1" TABLESAMPLE system (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1" TABLESAMPLE bernoulli (50.0 PERCENT)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM "test" AS "t0"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM "test" AS "t0"
WHERE
"t0"."x" > 10
) AS "t1"
WHERE
RANDOM() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
SELECT
*
FROM (
SELECT
*
FROM `test` AS `t0`
WHERE
`t0`.`x` > 10
) AS `t1`
WHERE
RAND() <= 0.5
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
SELECT
*
FROM `test` AS `t0` TABLESAMPLE system (50)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
SELECT
*
FROM `test` AS `t0`
WHERE
RAND(UTC_TO_UNIX_MICROS(UTC_TIMESTAMP())) <= 0.5
Loading

0 comments on commit abb4c85

Please sign in to comment.