Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Dictionary String (UTF8) type to String sqllogictests #12621

Merged
merged 5 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use crate::engines::output::DFColumnType;
use arrow::array::Array;
use arrow::datatypes::Fields;
use arrow::util::display::ArrayFormatter;
use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
Expand All @@ -23,8 +25,6 @@ use datafusion_common::DataFusionError;
use std::path::PathBuf;
use std::sync::OnceLock;

use crate::engines::output::DFColumnType;

use super::super::conversion::*;
use super::error::{DFSqlLogicTestError, Result};

Expand Down Expand Up @@ -174,7 +174,7 @@ fn convert_batch(batch: RecordBatch) -> Result<Vec<Vec<String>>> {
batch
.columns()
.iter()
.map(|col| cell_to_string(col, row))
.map(|col| cell_to_string(col, row, col.data_type()))
.collect::<Result<Vec<String>>>()
})
.collect()
Expand All @@ -198,12 +198,16 @@ macro_rules! get_row_value {
///
/// Floating numbers are rounded to have a consistent representation with the Postgres runner.
///
pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result<String> {
pub fn cell_to_string(
col: &ArrayRef,
row: usize,
data_type: &DataType,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor nit: since this data type comes from `col`, it seems like we could keep the signature of this function the same and call `let data_type = col.data_type()` inside the function.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops... my bad. It's from another experimental change of mine that I forgot to roll back. I'll revert it. Thanks for mentioning it.

) -> Result<String> {
if !col.is_valid(row) {
// represent any null value with the string "NULL"
Ok(NULL_STR.to_string())
} else {
match col.data_type() {
match data_type {
DataType::Null => Ok(NULL_STR.to_string()),
DataType::Boolean => {
Ok(bool_to_str(get_row_value!(array::BooleanArray, col, row)))
Expand Down Expand Up @@ -275,6 +279,17 @@ pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec<DFColumnType> {
| DataType::Time32(_)
| DataType::Time64(_) => DFColumnType::DateTime,
DataType::Timestamp(_, _) => DFColumnType::Timestamp,
DataType::Dictionary(key_type, value_type) => {
if key_type.is_integer() {
// mapping dictionary string types to Text
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

match value_type.as_ref() {
DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text,
_ => DFColumnType::Another,
}
} else {
DFColumnType::Another
}
}
_ => DFColumnType::Another,
})
.collect()
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4546,7 +4546,7 @@ set datafusion.sql_parser.dialect = 'Generic';
statement ok
create table dict_test as values (1, arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (2, arrow_cast('bar', 'Dictionary(Int32, Utf8)'));

query I?
query IT
select * from dict_test;
----
1 foo
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/coalesce.slt
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,13 @@ select
statement ok
create table test1 as values (arrow_cast('foo', 'Dictionary(Int32, Utf8)')), (null);

query ?
query T
select coalesce(column1, 'none_set') from test1;
----
foo
none_set

query ?
query T
select coalesce(null, column1, 'none_set') from test1;
----
foo
Expand All @@ -246,7 +246,7 @@ drop table test1
statement ok
create table t(c varchar) as values ('a'), (null);

query ?T
query TT
select
coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')),
arrow_typeof(coalesce(c, arrow_cast('b', 'Dictionary(Int32, Utf8)')))
Expand All @@ -264,7 +264,7 @@ create table t as values
(arrow_cast('foo', 'Dictionary(Int32, Utf8)')),
(null);

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, LargeUtf8)')))
Expand All @@ -273,7 +273,7 @@ from t;
foo Dictionary(Int64, LargeUtf8)
bar Dictionary(Int64, LargeUtf8)

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int32, LargeUtf8)')))
Expand All @@ -282,7 +282,7 @@ from t;
foo Dictionary(Int32, LargeUtf8)
bar Dictionary(Int32, LargeUtf8)

query ?T
query TT
select
coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')),
arrow_typeof(coalesce(column1, arrow_cast('bar', 'Dictionary(Int64, Utf8)')))
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table1/' PARTITIONED BY (col2);

query I?
query IT
select * from validate_partitioned_parquet order by col1, col2;
----
1 Foo
Expand Down Expand Up @@ -64,7 +64,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet2 STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table2/' PARTITIONED BY (column2, column3);

query I??
query ITT
select * from validate_partitioned_parquet2 order by column1,column2,column3;
----
1 a x
Expand Down Expand Up @@ -92,7 +92,7 @@ statement ok
CREATE EXTERNAL TABLE validate_partitioned_parquet3 STORED AS PARQUET
LOCATION 'test_files/scratch/copy/partitioned_table3/' PARTITIONED BY (column1, column3);

query ?T?
query TTT
select column1, column2, column3 from validate_partitioned_parquet3 order by column1,column2,column3;
----
1 a x
Expand Down Expand Up @@ -552,7 +552,7 @@ CREATE EXTERNAL TABLE validate_arrow_file_dict
STORED AS arrow
LOCATION 'test_files/scratch/copy/table_dict.arrow';

query T?
query TT
select * from validate_arrow_file_dict;
----
c foo
Expand Down
14 changes: 7 additions & 7 deletions datafusion/sqllogictest/test_files/dictionary.slt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ FROM (
('1000', 32, 'foo', 'True', 10.0, 1703035800000000000)
);

query ?RTTRP
query TRTTRP
SELECT * FROM m1;
----
1000 32 foo True 1 2023-12-20T00:00:00
Expand Down Expand Up @@ -137,7 +137,7 @@ FROM (
('passive', '1000', 1000, 1701653400000000000)
);

query ??RP
query TTRP
SELECT * FROM m2;
----
active 1000 100 2023-12-04T00:00:00
Expand Down Expand Up @@ -208,7 +208,7 @@ true false NULL true true false true NULL

# Reproducer for https://github.com/apache/datafusion/issues/8738
# This query should work correctly
query P?TT rowsort
query PTTT rowsort
SELECT
"data"."timestamp" as "time",
"data"."tag_id",
Expand Down Expand Up @@ -264,7 +264,7 @@ ORDER BY


# deterministic sort (so we can avoid rowsort)
query P?TT
query PTTT
SELECT
"data"."timestamp" as "time",
"data"."tag_id",
Expand Down Expand Up @@ -348,7 +348,7 @@ create table m3 as
from m3_source;

# there are two values in column2
query T?I rowsort
query TTI rowsort
SELECT *
FROM m3;
----
Expand Down Expand Up @@ -397,7 +397,7 @@ create table test as values
;

# query using a string '1' which must be coerced into a dictionary string
query T?
query TT
SELECT * from test where column2 = '1';
----
row1 1
Expand Down Expand Up @@ -429,7 +429,7 @@ physical_plan


# Now query using an integer which must be coerced into a dictionary string
query T?
query TT
SELECT * from test where column2 = 1;
----
row1 1
Expand Down
16 changes: 8 additions & 8 deletions datafusion/sqllogictest/test_files/group_by.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4614,7 +4614,7 @@ CREATE TABLE int8_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int8, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int8_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4652,7 +4652,7 @@ CREATE TABLE int16_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int16, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int16_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4690,7 +4690,7 @@ CREATE TABLE int32_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int32, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int32_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4728,7 +4728,7 @@ CREATE TABLE int64_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(Int64, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM int64_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4766,7 +4766,7 @@ CREATE TABLE uint8_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt8, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint8_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4804,7 +4804,7 @@ CREATE TABLE uint16_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt16, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint16_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4842,7 +4842,7 @@ CREATE TABLE uint32_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt32, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint32_dict GROUP BY column2;
----
A 4
Expand Down Expand Up @@ -4880,7 +4880,7 @@ CREATE TABLE uint64_dict AS VALUES
(1, arrow_cast('A', 'Dictionary(UInt64, Utf8)'));

# Group by the non-dict column
query ?I rowsort
query TI rowsort
SELECT column2, count(column1) FROM uint64_dict GROUP BY column2;
----
A 4
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/joins.slt
Original file line number Diff line number Diff line change
Expand Up @@ -2672,7 +2672,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]

# hash_join_with_date32
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand All @@ -2691,7 +2691,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t2 projection=[c1, c2, c3, c4]

# hash_join_with_date64
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 left join hashjoin_datatype_table_t2 t2 on t1.c2 = t2.c2
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand All @@ -2712,7 +2712,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]

# hash_join_with_decimal
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t1 t2 on t1.c3 = t2.c3
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 1.23 abc
Expand All @@ -2732,7 +2732,7 @@ logical_plan
05)----TableScan: hashjoin_datatype_table_t1 projection=[c1, c2, c3, c4]

# hash_join_with_dictionary
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c4 = t2.c4
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down Expand Up @@ -2783,7 +2783,7 @@ physical_plan
11)----------MemoryExec: partitions=1, partition_sizes=[1]

# sort_merge_join_on_date32 inner sort merge join on data type (Date32)
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 join hashjoin_datatype_table_t2 t2 on t1.c1 = t2.c1
----
1970-01-02 1970-01-02T00:00:00 1.23 abc 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down Expand Up @@ -2815,7 +2815,7 @@ physical_plan
13)------------MemoryExec: partitions=1, partition_sizes=[1]

# sort_merge_join_on_decimal right join on data type (Decimal)
query DDR?DDR? rowsort
query DDRTDDRT rowsort
select * from hashjoin_datatype_table_t1 t1 right join hashjoin_datatype_table_t2 t2 on t1.c3 = t2.c3
----
1970-01-04 NULL -123.12 jkl 1970-01-02 1970-01-02T00:00:00 -123.12 abc
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/regexp.slt
Original file line number Diff line number Diff line change
Expand Up @@ -478,30 +478,30 @@ create or replace table dict_table as
select arrow_cast(column1, 'Dictionary(Int32, Utf8)') as column1
from strings;

query ?
query T
select column1 from dict_table where column1 LIKE '%oo%';
----
FooBar
Foo
Foo
FooBar

query ?
query T
select column1 from dict_table where column1 NOT LIKE '%oo%';
----
Bar
Bar
Baz

query ?
query T
select column1 from dict_table where column1 ILIKE '%oO%';
----
FooBar
Foo
Foo
FooBar

query ?
query T
select column1 from dict_table where column1 NOT ILIKE '%oO%';
----
Bar
Expand Down
Loading