From bc5d7f72f813f9c47dac300f03ce7c2081232fde Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Tue, 20 Aug 2024 15:51:37 -0400 Subject: [PATCH] Enable reading string view by default from Parquet --- datafusion/common/src/config.rs | 10 +++++++--- .../sqllogictest/test_files/information_schema.slt | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 37d26c6f00c4..4c3353564651 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -487,9 +487,13 @@ config_namespace! { /// data frame. pub maximum_buffered_record_batches_per_stream: usize, default = 2 - /// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, - /// and `Binary/BinaryLarge` with `BinaryView`. - pub schema_force_string_view: bool, default = false + /// (reading) If true (the default), parquet reader will read text and + /// binary columns using Arrow byte view types. DataFusion has + /// specialized processing using the Arrow `Utf8View` type for columns + /// that could also be read as `Utf8/Utf8Large` and using the Arrow + /// `BinaryView` type for columns that could also be read as + /// `Binary/BinaryLarge`. 
+ pub schema_force_string_view: bool, default = true } } diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index efd017a90bc4..c69ac79195ae 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -201,7 +201,7 @@ datafusion.execution.parquet.metadata_size_hint NULL datafusion.execution.parquet.pruning true datafusion.execution.parquet.pushdown_filters false datafusion.execution.parquet.reorder_filters false -datafusion.execution.parquet.schema_force_string_view false +datafusion.execution.parquet.schema_force_string_view true datafusion.execution.parquet.skip_metadata true datafusion.execution.parquet.statistics_enabled page datafusion.execution.parquet.write_batch_size 1024 @@ -291,7 +291,7 @@ datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query -datafusion.execution.parquet.schema_force_string_view false (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. 
+datafusion.execution.parquet.schema_force_string_view true (reading) If true (the default), parquet reader will read text and binary columns using Arrow byte view types. DataFusion has specialized processing using the Arrow `Utf8View` type for columns that could also be read as `Utf8/Utf8Large` and using the Arrow `BinaryView` type for columns that could also be read as `Binary/BinaryLarge`. datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes