diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index b6b583b9fbdb..a88ee9d08df0 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +use crate::engines::output::DFColumnType; +use arrow::array::Array; use arrow::datatypes::Fields; use arrow::util::display::ArrayFormatter; use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch}; @@ -23,8 +25,6 @@ use datafusion_common::DataFusionError; use std::path::PathBuf; use std::sync::OnceLock; -use crate::engines::output::DFColumnType; - use super::super::conversion::*; use super::error::{DFSqlLogicTestError, Result}; @@ -275,6 +275,17 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec { | DataType::Time32(_) | DataType::Time64(_) => DFColumnType::DateTime, DataType::Timestamp(_, _) => DFColumnType::Timestamp, + DataType::Dictionary(key_type, value_type) => { + if key_type.is_integer() { + // mapping dictionary string types to Text + match value_type.as_ref() { + DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text, + _ => DFColumnType::Another, + } + } else { + DFColumnType::Another + } + } _ => DFColumnType::Another, }) .collect() diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 3ae109bc80a6..56756cb2010b 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -4546,7 +4546,7 @@ set datafusion.sql_parser.dialect = 'Generic'; statement ok create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)')); -query I? +query IT select * from dict_test; ---- 1 foo diff --git a/datafusion/sqllogictest/test_files/coalesce.slt b/datafusion/sqllogictest/test_files/coalesce.slt index 0e977666ccfd..97e77d0feb3d 100644 --- a/datafusion/sqllogictest/test_files/coalesce.slt +++ b/datafusion/sqllogictest/test_files/coalesce.slt @@ -220,13 +220,13 @@ select statement ok create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); -query ? +query T select coalesce(column1, 'none_set') from test1; ---- foo none_set -query ? +query T select coalesce(null, column1, 'none_set') from test1; ---- foo @@ -246,7 +246,7 @@ drop table test1 statement ok create table t(c varchar) as values ('a'), (null); -query ?T +query TT select coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')), arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)'))) @@ -264,7 +264,7 @@ create table t as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null); -query ?T +query TT select coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')), arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)'))) @@ -273,7 +273,7 @@ from t; foo Dictionary(Int64, LargeUtf8) bar Dictionary(Int64, LargeUtf8) -query ?T +query TT select coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')), arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)'))) @@ -282,7 +282,7 @@ from t; foo Dictionary(Int32, LargeUtf8) bar Dictionary(Int32, LargeUtf8) -query ?T +query TT select coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')), arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)'))) diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 40d229f9716c..caa708483a11 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -36,7 +36,7 @@ statement ok CREATE EXTERNAL TABLE validate_partitioned_parquet STORED AS PARQUET LOCATION 'test_files/scratch/copy/partitioned_table1/' PARTITIONED BY (col2); -query I? +query IT select * from validate_partitioned_parquet order by col1, col2; ---- 1 Foo @@ -64,7 +64,7 @@ statement ok CREATE EXTERNAL TABLE validate_partitioned_parquet2 STORED AS PARQUET LOCATION 'test_files/scratch/copy/partitioned_table2/' PARTITIONED BY (column2, column3); -query I?? +query ITT select * from validate_partitioned_parquet2 order by column1,column2,column3; ---- 1 a x @@ -92,7 +92,7 @@ statement ok CREATE EXTERNAL TABLE validate_partitioned_parquet3 STORED AS PARQUET LOCATION 'test_files/scratch/copy/partitioned_table3/' PARTITIONED BY (column1, column3); -query ?T? +query TTT select column1, column2, column3 from validate_partitioned_parquet3 order by column1,column2,column3; ---- 1 a x @@ -552,7 +552,7 @@ CREATE EXTERNAL TABLE validate_arrow_file_dict STORED AS arrow LOCATION 'test_files/scratch/copy/table_dict.arrow'; -query T? +query TT select * from validate_arrow_file_dict; ---- c foo diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index ec8a51488564..176331f570b0 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -62,7 +62,7 @@ FROM ( ('1000', 32, 'foo', 'True', 10.0, 1703035800000000000) ); -query ?RTTRP +query TRTTRP SELECT * FROM m1; ---- 1000 32 foo True 1 2023-12-20T00:00:00 @@ -137,7 +137,7 @@ FROM ( ('passive', '1000', 1000, 1701653400000000000) ); -query ??RP +query TTRP SELECT * FROM m2; ---- active 1000 100 2023-12-04T00:00:00 @@ -208,7 +208,7 @@ true false NULL true true false true NULL # Reproducer for https://github.com/apache/datafusion/issues/8738 # This query should work correctly -query P?TT rowsort +query PTTT rowsort SELECT "data"."timestamp" as "time", "data"."tag_id", @@ -264,7 +264,7 @@ ORDER BY # deterministic sort (so we can avoid rowsort) -query P?TT +query PTTT SELECT "data"."timestamp" as "time", "data"."tag_id", @@ -348,7 +348,7 @@ create table m3 as from m3_source; # there are two values in column2 -query T?I rowsort +query TTI rowsort SELECT * FROM m3; ---- @@ -397,7 +397,7 @@ create table test as values ; # query using an string '1' which must be coerced into a dictionary string -query T? +query TT SELECT * from test where column2 = '1'; ---- row1 1 @@ -429,7 +429,7 @@ physical_plan # Now query using an integer which must be coerced into a dictionary string -query T? +query TT SELECT * from test where column2 = 1; ---- row1 1 diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 86651f6ce43c..f561fa9e9ac8 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4614,7 +4614,7 @@ CREATE TABLE int8_dict AS VALUES (1, arrow_cast('A', 'Dictionary(Int8, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM int8_dict GROUP BY column2; ---- A 4 @@ -4652,7 +4652,7 @@ CREATE TABLE int16_dict AS VALUES (1, arrow_cast('A', 'Dictionary(Int16, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM int16_dict GROUP BY column2; ---- A 4 @@ -4690,7 +4690,7 @@ CREATE TABLE int32_dict AS VALUES (1, arrow_cast('A', 'Dictionary(Int32, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM int32_dict GROUP BY column2; ---- A 4 @@ -4728,7 +4728,7 @@ CREATE TABLE int64_dict AS VALUES (1, arrow_cast('A', 'Dictionary(Int64, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM int64_dict GROUP BY column2; ---- A 4 @@ -4766,7 +4766,7 @@ CREATE TABLE uint8_dict AS VALUES (1, arrow_cast('A', 'Dictionary(UInt8, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM uint8_dict GROUP BY column2; ---- A 4 @@ -4804,7 +4804,7 @@ CREATE TABLE uint16_dict AS VALUES (1, arrow_cast('A', 'Dictionary(UInt16, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM uint16_dict GROUP BY column2; ---- A 4 @@ -4842,7 +4842,7 @@ CREATE TABLE uint32_dict AS VALUES (1, arrow_cast('A', 'Dictionary(UInt32, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM uint32_dict GROUP BY column2; ---- A 4 @@ -4880,7 +4880,7 @@ CREATE TABLE uint64_dict AS VALUES (1, arrow_cast('A', 'Dictionary(UInt64, Utf8)')); # Group by the non-dict column -query ?I rowsort +query TI rowsort SELECT column2, count(column1) FROM uint64_dict GROUP BY column2; ---- A 4 diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 91a9fe361f7a..a7a252cc20d7 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -2672,7 +2672,7 @@ logical_plan 05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] # hash_join_with_date32 -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1 ---- 1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc @@ -2691,7 +2691,7 @@ logical_plan 05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4] # hash_join_with_date64 -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 left join hashjoin_datatype_table_t2 t2 on t1.c2 = t2.c2 ---- 1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc @@ -2712,7 +2712,7 @@ logical_plan 05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] # hash_join_with_decimal -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t1 t2 on t1.c3 = t2.c3 ---- 1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 1.23 abc @@ -2732,7 +2732,7 @@ logical_plan 05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4] # hash_join_with_dictionary -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c4 = t2.c4 ---- 1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc @@ -2783,7 +2783,7 @@ physical_plan 11)----------MemoryExec: partitions=1, partition_sizes=[1] # sort_merge_join_on_date32 inner sort merge join on data type (Date32) -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1 ---- 1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc @@ -2815,7 +2815,7 @@ physical_plan 13)------------MemoryExec: partitions=1, partition_sizes=[1] # sort_merge_join_on_decimal right join on data type (Decimal) -query DDR?DDR? rowsort +query DDRTDDRT rowsort select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t2 t2 on t1.c3 = t2.c3 ---- 1970-01-04 NULL -123.12 jkl 1970-01-02 1970-01-02T00:00:00 -123.12 abc diff --git a/datafusion/sqllogictest/test_files/regexp.slt b/datafusion/sqllogictest/test_files/regexp.slt index 1685ed51afef..eedc3ddb6d59 100644 --- a/datafusion/sqllogictest/test_files/regexp.slt +++ b/datafusion/sqllogictest/test_files/regexp.slt @@ -478,7 +478,7 @@ create or replace table dict_table as select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1 from strings; -query ? +query T select column1 from dict_table where column1 LIKE '%oo%'; ---- FooBar @@ -486,14 +486,14 @@ Foo Foo FooBar -query ? +query T select column1 from dict_table where column1 NOT LIKE '%oo%'; ---- Bar Bar Baz -query ? +query T select column1 from dict_table where column1 ILIKE '%oO%'; ---- FooBar @@ -501,7 +501,7 @@ Foo Foo FooBar -query ? +query T select column1 from dict_table where column1 NOT ILIKE '%oO%'; ---- Bar diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 6950c41f5405..3286ae32d4d8 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -67,7 +67,7 @@ AS SELECT arrow_cast(x, 'Dictionary(Int32, Utf8)') as d1, y as d2, arrow_cast(z, 'LargeUtf8') as d3 FROM window_null_string_value_prepare; -query ?I +query TI SELECT d1, row_number() OVER (partition by d1) as rn1 FROM window_null_string_table order by d1 asc; ---- one 1 @@ -241,7 +241,7 @@ AS SELECT arrow_cast(x, 'Dictionary(Int32, Utf8)') as d1, arrow_cast(y, 'Dictionary(Int32, Utf8)') as d2, z as d3 FROM value; -query ? +query T SELECT d1 FROM string_dictionary_table; ---- one @@ -249,38 +249,38 @@ NULL three # basic filtering -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 IS NOT NULL; ---- one three # comparison with constant -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 = 'three'; ---- three # comparison with another dictionary column -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 = d2; ---- three # order comparison with another dictionary column -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 <= d2; ---- three # comparison with a non dictionary column -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 = d3; ---- three # filtering with constant -query ? +query T SELECT d1 FROM string_dictionary_table WHERE d1 = 'three'; ---- three @@ -320,7 +320,7 @@ SELECT MAX(d1) FROM string_dictionary_table; three # grouping -query ?I +query TI SELECT d1, COUNT(*) FROM string_dictionary_table group by d1 order by d1; ---- one 1 @@ -328,7 +328,7 @@ three 1 NULL 1 # window functions -query ?I +query TI SELECT d1, row_number() OVER (partition by d1) as rn1 FROM string_dictionary_table order by d1; ---- one 1 diff --git a/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt new file mode 100644 index 000000000000..9d2460816709 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string/dictionary_utf8.slt @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +include ./init_data.slt.part + +# -------------------------------------- +# Setup test tables with different physical string types +# and repeat tests in `string_query.slt.part` +# -------------------------------------- +statement ok +create table test_basic_operator as +select + arrow_cast(column1, 'Dictionary(Int32, Utf8)') as ascii_1, + arrow_cast(column2, 'Dictionary(Int32, Utf8)') as ascii_2, + arrow_cast(column3, 'Dictionary(Int32, Utf8)') as unicode_1, + arrow_cast(column4, 'Dictionary(Int32, Utf8)') as unicode_2 +from test_source; + +statement ok +create table test_substr as +select arrow_cast(col1, 'Dictionary(Int32, Utf8)') as c1 from test_substr_base; + +statement ok +drop table test_source + +# +# common test for string-like functions and operators +# +include ./string_query.slt.part + +# +# Clean up +# +statement ok +drop table test_basic_operator; + +statement ok +drop table test_substr_base; diff --git a/datafusion/sqllogictest/test_files/string/large_string.slt b/datafusion/sqllogictest/test_files/string/large_string.slt index d90f2bffe0f5..a2e570073ff6 100644 --- a/datafusion/sqllogictest/test_files/string/large_string.slt +++ b/datafusion/sqllogictest/test_files/string/large_string.slt @@ -34,6 +34,28 @@ statement ok create table test_substr as select arrow_cast(col1, 'LargeUtf8') as c1 from test_substr_base; +# select +query TTTT +SELECT ascii_1, ascii_2, unicode_1, unicode_2 FROM test_basic_operator +---- +Andrew X datafusion📊🔥 🔥 +Xiangpeng Xiangpeng datafusion数据融合 datafusion数据融合 +Raphael R datafusionДатаФусион аФус +NULL R NULL 🔥 + +# TODO: move it back to `string_query.slt.part` after fixing the issue +# https://github.com/apache/datafusion/issues/12618 +query BB +SELECT + ascii_1 ~* '^a.{3}e', + unicode_1 ~* '^d.*Фу' +FROM test_basic_operator; +---- +true false +false false +false true +NULL NULL + # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string.slt b/datafusion/sqllogictest/test_files/string/string.slt index 435795309f52..bc923d5e12c3 100644 --- a/datafusion/sqllogictest/test_files/string/string.slt +++ b/datafusion/sqllogictest/test_files/string/string.slt @@ -34,6 +34,19 @@ statement ok create table test_substr as select arrow_cast(col1, 'Utf8') as c1 from test_substr_base; +# TODO: move it back to `string_query.slt.part` after fixing the issue +# https://github.com/apache/datafusion/issues/12618 +query BB +SELECT + ascii_1 ~* '^a.{3}e', + unicode_1 ~* '^d.*Фу' +FROM test_basic_operator; +---- +true false +false false +false true +NULL NULL + # # common test for string-like functions and operators # diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index f22edae01de9..96d5ddbd992c 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -642,16 +642,18 @@ true false false true NULL NULL -query BB -SELECT - ascii_1 ~* '^a.{3}e', - unicode_1 ~* '^d.*Фу' -FROM test_basic_operator; ----- -true false -false false -false true -NULL NULL +# TODO: DictionaryString does not support ~* operator. Enable this after fixing the issue +# see issue: https://github.com/apache/datafusion/issues/12618 +#query BB +#SELECT +# ascii_1 ~* '^a.{3}e', +# unicode_1 ~* '^d.*Фу' +#FROM test_basic_operator; +#---- +#true false +#false false +#false true +#NULL NULL query BB SELECT diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index da7d99cc311b..e7b55c9c1c8c 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -37,6 +37,19 @@ select arrow_cast(col1, 'Utf8View') as c1 from test_substr_base; statement ok drop table test_source +# TODO: move it back to `string_query.slt.part` after fixing the issue +# https://github.com/apache/datafusion/issues/12618 +query BB +SELECT + ascii_1 ~* '^a.{3}e', + unicode_1 ~* '^d.*Фу' +FROM test_basic_operator; +---- +true false +false false +false true +NULL NULL + # # common test for string-like functions and operators # @@ -969,34 +982,5 @@ logical_plan 01)Projection: temp.column2 || temp.column3 02)--TableScan: temp projection=[column2, column3] -################################################ -# Test for Dictionary String concatenation -################################################ - -# || same type (column1 has null, so also tests NULL || NULL) -# expect all results to be the same for each row as they all have the same values -query T -SELECT - column1_dict || column1_dict -FROM test; ----- -AndrewAndrew -XiangpengXiangpeng -RaphaelRaphael -NULL - -# || constants -# expect all results to be the same for each row as they all have the same values -query TT -SELECT - column1_dict || 'foo', - 'foo' || column1_dict -FROM test; ----- -Andrewfoo fooAndrew -Xiangpengfoo fooXiangpeng -Raphaelfoo fooRaphael -NULL NULL - statement ok drop table test diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 53f4ffe4edf5..1dbce79e0f1a 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -220,7 +220,7 @@ a 1 -5 12636 794623392 2909750622865366631 15 24022 2669374863 47766797847015095 statement ok create table dict as select c1, c2, c3, c13, arrow_cast(c13, 'Dictionary(Int32, Utf8)') as c13_dict from aggregate_test_100; -query TIIT? +query TIITT select * from dict order by c13 desc limit 5; ---- a 4 -38 ydkwycaISlYSlEq3TlkS2m15I2pcp8 ydkwycaISlYSlEq3TlkS2m15I2pcp8