From 7e33a3ef4ad192db2537d3d9a5341826d5ddda42 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 Sep 2024 07:04:27 -0400 Subject: [PATCH 01/11] perf(snowflake): speed up memtable existence check by using a `SELECT 1 ... LIMIT 0` probe --- ibis/backends/snowflake/__init__.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ibis/backends/snowflake/__init__.py b/ibis/backends/snowflake/__init__.py index 5eb378186414..93522f9b2113 100644 --- a/ibis/backends/snowflake/__init__.py +++ b/ibis/backends/snowflake/__init__.py @@ -645,9 +645,23 @@ def list_tables( return self._filter_with_like(tables + views, like=like) def _in_memory_table_exists(self, name: str) -> bool: - with self.con.cursor() as con: - result = con.execute(f"SHOW TABLES LIKE '{name}'").fetchone() - return bool(result) + import snowflake.connector + + ident = sg.to_identifier(name, quoted=self.compiler.quoted) + sql = sg.select(sge.convert(1)).from_(ident).limit(0).sql(self.dialect) + + try: + with self.con.cursor() as cur: + cur.execute(sql).fetchall() + except snowflake.connector.errors.ProgrammingError as e: + # this cryptic error message is the only generic and reliable way + # to tell if the error means "table not found for any reason" + # otherwise, we need to reraise the exception + if e.sqlstate == "42S02": + return False + raise + else: + return True def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: import pyarrow.parquet as pq From 7a3797e78506d74dd913b7372218aeda27e3fc13 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 Sep 2024 07:27:57 -0400 Subject: [PATCH 02/11] perf(trino): use a `SELECT 1 ... LIMIT 0` probe for faster table existence checking --- ibis/backends/trino/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ibis/backends/trino/__init__.py b/ibis/backends/trino/__init__.py index 7c4ab32ec77b..0c182f7c0c9a 100644 --- 
a/ibis/backends/trino/__init__.py +++ b/ibis/backends/trino/__init__.py @@ -552,6 +552,21 @@ def _fetch_from_cursor(self, cursor, schema: sch.Schema) -> pd.DataFrame: df = TrinoPandasData.convert_table(df, schema) return df + def _in_memory_table_exists(self, name: str) -> bool: + ident = sg.to_identifier(name, quoted=self.compiler.quoted) + sql = sg.select(sge.convert(1)).from_(ident).limit(0).sql(self.dialect) + + try: + with self.begin() as cur: + cur.execute(sql) + cur.fetchall() + except trino.exceptions.TrinoUserError as e: + if e.error_name == "TABLE_NOT_FOUND": + return False + raise + else: + return True + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: From 5d5e1ab19db60c6a6bdb0376e80b0137c0c608b3 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 Sep 2024 08:11:53 -0400 Subject: [PATCH 03/11] perf(postgres): avoid listing all tables to query table existence --- ibis/backends/postgres/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py index fcfa517a8aff..777b28e00b25 100644 --- a/ibis/backends/postgres/__init__.py +++ b/ibis/backends/postgres/__init__.py @@ -89,6 +89,21 @@ def _from_url(self, url: ParseResult, **kwargs): return self.connect(**kwargs) + def _in_memory_table_exists(self, name: str) -> bool: + import psycopg2.errors + + ident = sg.to_identifier(name, quoted=self.compiler.quoted) + sql = sg.select(sge.convert(1)).from_(ident).limit(0).sql(self.dialect) + + try: + with self.begin() as cur: + cur.execute(sql) + cur.fetchall() + except psycopg2.errors.UndefinedTable: + return False + else: + return True + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: from psycopg2.extras import execute_batch From 0794a5b4b806e420ab2b4bcd9ea55564ca7388a8 Mon Sep 17 00:00:00 2001 
From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 Sep 2024 08:26:48 -0400 Subject: [PATCH 04/11] perf(risingwave): use faster check for existence --- ibis/backends/risingwave/__init__.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ibis/backends/risingwave/__init__.py b/ibis/backends/risingwave/__init__.py index e824d93d93a3..27ae76cf9385 100644 --- a/ibis/backends/risingwave/__init__.py +++ b/ibis/backends/risingwave/__init__.py @@ -260,6 +260,21 @@ def create_table( name, schema=schema, source=self, namespace=ops.Namespace(database=database) ).to_expr() + def _in_memory_table_exists(self, name: str) -> bool: + import psycopg2.errors + + ident = sg.to_identifier(name, quoted=self.compiler.quoted) + sql = sg.select(sge.convert(1)).from_(ident).limit(0).sql(self.dialect) + + try: + with self.begin() as cur: + cur.execute(sql) + cur.fetchall() + except psycopg2.errors.InternalError: + return False + else: + return True + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: From 66922ce09cfb115be17c616c476d1ef60167e401 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Sun, 8 Sep 2024 08:57:28 -0400 Subject: [PATCH 05/11] perf(sqlite): use faster table existence check --- ibis/backends/sqlite/__init__.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/ibis/backends/sqlite/__init__.py b/ibis/backends/sqlite/__init__.py index c52c654486ee..d59a1fbc39dc 100644 --- a/ibis/backends/sqlite/__init__.py +++ b/ibis/backends/sqlite/__init__.py @@ -338,6 +338,18 @@ def _generate_create_table(self, table: sge.Table, schema: sch.Schema): return sge.Create(kind="TABLE", this=target) + def _in_memory_table_exists(self, name: str) -> bool: + ident = sg.to_identifier(name, quoted=self.compiler.quoted) + query = 
sg.select(sge.convert(1)).from_(ident).limit(0).sql(self.dialect) + try: + with self.begin() as cur: + cur.execute(query) + cur.fetchall() + except sqlite3.OperationalError: + return False + else: + return True + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: table = sg.table(op.name, quoted=self.compiler.quoted, catalog="temp") create_stmt = self._generate_create_table(table, op.schema).sql(self.name) From 69d6e2200378230bf514081634a710393b65ab77 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:56:45 -0400 Subject: [PATCH 06/11] perf(pyspark): faster memtable existence check --- ibis/backends/pyspark/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ibis/backends/pyspark/__init__.py b/ibis/backends/pyspark/__init__.py index def05da78f82..c19958e3db38 100644 --- a/ibis/backends/pyspark/__init__.py +++ b/ibis/backends/pyspark/__init__.py @@ -411,11 +411,18 @@ def _register_udfs(self, expr: ir.Expr) -> None: self._session.udf.register(f"unwrap_json_{typ.__name__}", unwrap_json(typ)) self._session.udf.register("unwrap_json_float", unwrap_json_float) + def _in_memory_table_exists(self, name: str) -> bool: + sql = f"SHOW TABLES IN {self.current_database} LIKE '{name}'" + return bool(self._session.sql(sql).count()) + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = PySparkSchema.from_ibis(op.schema) df = self._session.createDataFrame(data=op.data.to_frame(), schema=schema) df.createTempView(op.name) + def _finalize_memtable(self, name: str) -> None: + self._session.catalog.dropTempView(name) + @contextlib.contextmanager def _safe_raw_sql(self, query: str) -> Any: yield self.raw_sql(query) From a5a0ba7d46bdf6ce508650f56f60616885390688 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 10 Sep 2024 13:58:54 -0400 Subject: [PATCH 07/11] perf(clickhouse): improve table existence check --- 
ibis/backends/clickhouse/__init__.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ibis/backends/clickhouse/__init__.py b/ibis/backends/clickhouse/__init__.py index 664b27048a75..06b344e014a8 100644 --- a/ibis/backends/clickhouse/__init__.py +++ b/ibis/backends/clickhouse/__init__.py @@ -773,3 +773,23 @@ def create_view( with self._safe_raw_sql(src, external_tables=external_tables): pass return self.table(name, database=database) + + def _in_memory_table_exists(self, name: str) -> bool: + name = sg.table(name, quoted=self.compiler.quoted).sql(self.dialect) + try: + # DESCRIBE TABLE $TABLE FORMAT NULL is the fastest way to check + # table existence in clickhouse; FORMAT NULL produces no data which + # is ideal since we don't care about the output for existence + # checking + # + # Other methods compared were + # 1. SELECT 1 FROM $TABLE LIMIT 0 + # 2. SHOW TABLES LIKE $TABLE LIMIT 1 + # + # if the table exists nothing is returned and there's no error + # otherwise there's an error + self.con.raw_query(f"DESCRIBE {name} FORMAT NULL") + except cc.driver.exceptions.DatabaseError: + return False + else: + return True From 30cc6c49ff76e3c3416d27c0e50446f1c282eb6f Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Tue, 10 Sep 2024 14:13:21 -0400 Subject: [PATCH 08/11] perf(mysql): improve table existence check --- ibis/backends/mysql/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py index eec4df9ac634..4281c81831d0 100644 --- a/ibis/backends/mysql/__init__.py +++ b/ibis/backends/mysql/__init__.py @@ -13,6 +13,7 @@ import pymysql import sqlglot as sg import sqlglot.expressions as sge +from pymysql.constants import ER import ibis import ibis.backends.sql.compilers as sc @@ -465,6 +466,23 @@ def create_table( name, schema=schema, source=self, namespace=ops.Namespace(database=database) ).to_expr() + def 
_in_memory_table_exists(self, name: str) -> bool: + name = sg.to_identifier(name, quoted=self.compiler.quoted).sql(self.dialect) + # just return the single field with column names; no need to bring back + # everything if the command succeeds + sql = f"SHOW COLUMNS FROM {name} LIKE 'Field'" + try: + with self.begin() as cur: + cur.execute(sql) + cur.fetchall() + except pymysql.err.ProgrammingError as e: + err_code, _ = e.args + if err_code == ER.NO_SUCH_TABLE: + return False + raise + else: + return True + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: From 1c7c29cbc2060f7ac7c0f4d6b5cb16f1908d40e4 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:07:35 -0400 Subject: [PATCH 09/11] perf(exasol): speed up existence check --- ibis/backends/exasol/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ibis/backends/exasol/__init__.py b/ibis/backends/exasol/__init__.py index 05456db385f5..84e4fbd2b005 100644 --- a/ibis/backends/exasol/__init__.py +++ b/ibis/backends/exasol/__init__.py @@ -243,6 +243,9 @@ def _get_schema_using_query(self, query: str) -> sch.Schema: finally: self.con.execute(drop_view) + def _in_memory_table_exists(self, name: str) -> bool: + return self.con.meta.table_exists(name) + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: From 7ea0ee4513c386f3764d1b6b979eacd62f25adf8 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:14:32 -0400 Subject: [PATCH 10/11] perf(mssql): speed up existence check --- ibis/backends/mssql/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ibis/backends/mssql/__init__.py b/ibis/backends/mssql/__init__.py index e20624e1cd07..737175f95750 
100644 --- a/ibis/backends/mssql/__init__.py +++ b/ibis/backends/mssql/__init__.py @@ -703,6 +703,16 @@ def create_table( namespace=ops.Namespace(catalog=catalog, database=db), ).to_expr() + def _in_memory_table_exists(self, name: str) -> bool: + # The single character U here means user-defined table + # see https://learn.microsoft.com/en-us/sql/relational-databases/system-catalog-views/sys-objects-transact-sql?view=sql-server-ver16 + sql = sg.select(sg.func("object_id", sge.convert(name), sge.convert("U"))).sql( + self.dialect + ) + with self.begin() as cur: + [(result,)] = cur.execute(sql).fetchall() + return result is not None + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema if null_columns := [col for col, dtype in schema.items() if dtype.is_null()]: From bcbf310131d01261c57620f5ff4ba7463ced1be7 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Wed, 11 Sep 2024 07:37:02 -0400 Subject: [PATCH 11/11] perf(oracle): speed up existence check --- ibis/backends/oracle/__init__.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/ibis/backends/oracle/__init__.py b/ibis/backends/oracle/__init__.py index 51d9427a2a6c..2481175c600b 100644 --- a/ibis/backends/oracle/__init__.py +++ b/ibis/backends/oracle/__init__.py @@ -24,7 +24,7 @@ from ibis import util from ibis.backends import CanListDatabase, CanListSchema from ibis.backends.sql import SQLBackend -from ibis.backends.sql.compilers.base import STAR, C +from ibis.backends.sql.compilers.base import NULL, STAR, C if TYPE_CHECKING: from urllib.parse import ParseResult @@ -495,6 +495,21 @@ def drop_table( super().drop_table(name, database=(catalog, db), force=force) + def _in_memory_table_exists(self, name: str) -> bool: + sql = ( + sg.select(NULL) + .from_(sg.to_identifier("USER_OBJECTS", quoted=self.compiler.quoted)) + .where( + C.OBJECT_TYPE.eq(sge.convert("TABLE")), + C.OBJECT_NAME.eq(sge.convert(name)), + 
) + .limit(sge.convert(1)) + .sql(self.dialect) + ) + with self.begin() as cur: + results = cur.execute(sql).fetchall() + return bool(results) + def _register_in_memory_table(self, op: ops.InMemoryTable) -> None: schema = op.schema