Skip to content

Commit

Permalink
Add Dictionary String (UTF8) type to String sqllogictests (apache#12621)
Browse files Browse the repository at this point in the history
* mapping DictionaryString to text

* disable and move out the failing case for dictionary string

* fix the schema for dictionary string

* roll back the unnecessary change

* cargo fmt
  • Loading branch information
goldmedal authored and bgjackma committed Sep 25, 2024
1 parent 2a4574f commit ca438e9
Show file tree
Hide file tree
Showing 15 changed files with 172 additions and 88 deletions.
15 changes: 13 additions & 2 deletions datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use crate::engines::output::DFColumnType;
use arrow::array::Array;
use arrow::datatypes::Fields;
use arrow::util::display::ArrayFormatter;
use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
Expand All @@ -23,8 +25,6 @@ use datafusion_common::DataFusionError;
use std::path::PathBuf;
use std::sync::OnceLock;

use crate::engines::output::DFColumnType;

use super::super::conversion::*;
use super::error::{DFSqlLogicTestError, Result};

Expand Down Expand Up @@ -275,6 +275,17 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec<DFColumnType> {
| DataType::Time32(_)
| DataType::Time64(_) => DFColumnType::DateTime,
DataType::Timestamp(_, _) => DFColumnType::Timestamp,
DataType::Dictionary(key_type, value_type) => {
if key_type.is_integer() {
// Map dictionary-encoded strings (integer keys, Utf8/LargeUtf8 values) to Text
match value_type.as_ref() {
DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text,
_ => DFColumnType::Another,
}
} else {
DFColumnType::Another
}
}
_ => DFColumnType::Another,
})
.collect()
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4546,7 +4546,7 @@ set datafusion.sql_parser.dialect = 'Generic';
statement ok
create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));

query I?
query IT
select * from dict_test;
----
1 foo
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/coalesce.slt
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,13 @@ select
statement ok
create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null);

query ?
query T
select coalesce(column1, 'none_set') from test1;
----
foo
none_set

query ?
query T
select coalesce(null, column1, 'none_set') from test1;
----
foo
Expand All @@ -246,7 +246,7 @@ drop table test1
statement ok
create table t(c varchar) as values ('a'), (null);

query ?T
query TT
select
coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')),
arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')))
Expand All @@ -264,7 +264,7 @@ create table t as values
(arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
(null);

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')))
Expand All @@ -273,7 +273,7 @@ from t;
foo Dictionary(Int64, LargeUtf8)
bar Dictionary(Int64, LargeUtf8)

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')))
Expand All @@ -282,7 +282,7 @@ from t;
foo Dictionary(Int32, LargeUtf8)
bar Dictionary(Int32, LargeUtf8)

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')))
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table1/' PARTITIONED BY (col2);

query I?
query IT
select * from validate_partitioned_parquet order by col1, col2;
----
1 Foo
Expand Down Expand Up @@ -64,7 +64,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet2 STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table2/' PARTITIONED BY (column2, column3);

query I??
query ITT
select * from validate_partitioned_parquet2 order by column1,column2,column3;
----
1 a x
Expand Down Expand Up @@ -92,7 +92,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet3 STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table3/' PARTITIONED BY (column1, column3);

query ?T?
query TTT
select column1, column2, column3 from validate_partitioned_parquet3 order by column1,column2,column3;
----
1 a x
Expand Down Expand Up @@ -552,7 +552,7 @@ CREATE EXTERNAL TABLE validate_arrow_file_dict
STORED AS arrow
LOCATION 'test_files/scratch/copy/table_dict.arrow';

query T?
query TT
select * from validate_arrow_file_dict;
----
c foo
Expand Down
14 changes: 7 additions & 7 deletions datafusion/sqllogictest/test_files/dictionary.slt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ FROM (
('1000', 32, 'foo', 'True', 10.0, 1703035800000000000)
);

query ?RTTRP
query TRTTRP
SELECT * FROM m1;
----
1000 32 foo True 1 2023-12-20T00:00:00
Expand Down Expand Up @@ -137,7 +137,7 @@ FROM (
('passive', '1000', 1000, 1701653400000000000)
);

query ??RP
query TTRP
SELECT * FROM m2;
----
active 1000 100 2023-12-04T00:00:00
Expand Down Expand Up @@ -208,7 +208,7 @@ true false NULL true true false true NULL

# Reproducer for https://github.com/apache/datafusion/issues/8738
# This query should work correctly
query P?TT rowsort
query PTTT rowsort
SELECT
"data"."timestamp" as "time",
"data"."tag_id",
Expand Down Expand Up @@ -264,7 +264,7 @@ ORDER BY


# deterministic sort (so we can avoid rowsort)
query P?TT
query PTTT
SELECT
"data"."timestamp" as "time",
"data"."tag_id",
Expand Down Expand Up @@ -348,7 +348,7 @@ create table m3 as
from m3_source;

# there are two values in column2
query T?I rowsort
query TTI rowsort
SELECT *
FROM m3;
----
Expand Down Expand Up @@ -397,7 +397,7 @@ create table test as values
;

# query using a string '1' which must be coerced into a dictionary string
query T?
query TT
SELECT * from test where column2 = '1';
----
row1 1
Expand Down Expand Up @@ -429,7 +429,7 @@ physical_plan


# Now query using an integer which must be coerced into a dictionary string
query T?
query TT
SELECT * from test where column2 = 1;
----
row1 1
Expand Down
16 changes: 8 additions & 8 deletions datafusion/sqllogictest/test_files/group_by.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4614,7 +4614,7 @@ CREATE TABLE int8_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int8, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int8_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4652,7 +4652,7 @@ CREATE TABLE int16_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int16, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int16_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4690,7 +4690,7 @@ CREATE TABLE int32_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int32, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int32_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4728,7 +4728,7 @@ CREATE TABLE int64_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int64, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int64_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4766,7 +4766,7 @@ CREATE TABLE uint8_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt8, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint8_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4804,7 +4804,7 @@ CREATE TABLE uint16_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt16, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint16_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4842,7 +4842,7 @@ CREATE TABLE uint32_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt32, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint32_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4880,7 +4880,7 @@ CREATE TABLE uint64_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt64, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint64_dict GROUP BY column2;
----
A 4
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/joins.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2672,7 +2672,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]

# hash_join_with_date32
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand All @@ -2691,7 +2691,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]

# hash_join_with_date64
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 left join hashjoin_datatype_table_t2 t2 on t1.c2 = t2.c2
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand All @@ -2712,7 +2712,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]

# hash_join_with_decimal
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t1 t2 on t1.c3 = t2.c3
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 1.23 abc
Expand All @@ -2732,7 +2732,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]

# hash_join_with_dictionary
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c4 = t2.c4
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down Expand Up @@ -2783,7 +2783,7 @@ physical_plan
11)----------MemoryExec: partitions=1, partition_sizes=[1]

# sort_merge_join_on_date32 inner sort merge join on data type (Date32)
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down Expand Up @@ -2815,7 +2815,7 @@ physical_plan
13)------------MemoryExec: partitions=1, partition_sizes=[1]

# sort_merge_join_on_decimal right join on data type (Decimal)
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t2 t2 on t1.c3 = t2.c3
----
1970-01-04 NULL -123.12 jkl 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -478,30 +478,30 @@ create or replace table dict_table as
select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1
from strings;

query ?
query T
select column1 from dict_table where column1 LIKE '%oo%';
----
FooBar
Foo
Foo
FooBar

query ?
query T
select column1 from dict_table where column1 NOT LIKE '%oo%';
----
Bar
Bar
Baz

query ?
query T
select column1 from dict_table where column1 ILIKE '%oO%';
----
FooBar
Foo
Foo
FooBar

query ?
query T
select column1 from dict_table where column1 NOT ILIKE '%oO%';
----
Bar
Expand Down
Loading

0 comments on commit ca438e9

Please sign in to comment.