Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add additional test coverage for aggregaes using dates/times/timestamps/decimals #6939

Merged
merged 2 commits into from
Jul 12, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 194 additions & 38 deletions datafusion/core/tests/sqllogictests/test_files/aggregate.slt
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,10 @@ WITH HEADER ROW
LOCATION '../../testing/data/csv/aggregate_test_100.csv'

statement ok
CREATE TABLE d_table (c1 decimal(10,3)) as values
(110.000), (110.001), (110.002), (110.003), (110.004), (110.005), (110.006), (110.007), (110.008), (110.009),
(-100.000),(-100.001),(-100.002),(-100.003),(-100.004),(-100.005),(-100.006),(-100.007),(-100.008),(-100.009)
CREATE TABLE d_table (c1 decimal(10,3), c2 varchar)
as values
(110.000, 'A'), (110.001, 'A'), (110.002, 'A'), (110.003, 'A'), (110.004, 'A'), (110.005, 'A'), (110.006, 'A'), (110.007, 'A'), (110.008, 'A'), (110.009, 'A'),
(-100.000, 'B'),(-100.001, 'B'),(-100.002, 'B'),(-100.003, 'B'),(-100.004, 'B'),(-100.005, 'B'),(-100.006, 'B'),(-100.007, 'B'),(-100.008, 'B'),(-100.009, 'B')

statement ok
CREATE TABLE median_table (
Expand Down Expand Up @@ -448,7 +449,7 @@ drop table cpu;

# this test is to show create table as and select into works in the same way
statement ok
SELECT * INTO cpu
SELECT * INTO cpu
FROM (VALUES
('host0', 90.1),
('host1', 90.2),
Expand Down Expand Up @@ -1483,22 +1484,6 @@ NULL 2
statement ok
drop table the_nulls;

# All supported timestamp types

# "nanos" --> TimestampNanosecondArray
# "micros" --> TimestampMicrosecondArray
# "millis" --> TimestampMillisecondArray
# "secs" --> TimestampSecondArray
# "names" --> StringArray

statement ok
create table t_source
as values
('2018-11-13T17:11:10.011375885995', 'Row 0'),
('2011-12-13T11:13:10.12345', 'Row 1'),
(null, 'Row 2'),
('2021-01-01T05:11:10.432', 'Row 3');

statement ok
create table bit_aggregate_functions (
c1 SMALLINT NOT NULL,
Expand Down Expand Up @@ -1568,62 +1553,196 @@ SELECT bool_or(distinct c1), bool_or(distinct c2), bool_or(distinct c3), bool_or
----
true true true false true true false NULL

# All supported timestamp types

# "nanos" --> TimestampNanosecondArray
# "micros" --> TimestampMicrosecondArray
# "millis" --> TimestampMillisecondArray
# "secs" --> TimestampSecondArray
# "names" --> StringArray

statement ok
create table t_source
as values
('2018-11-13T17:11:10.011375885995', 'Row 0', 'X'),
('2011-12-13T11:13:10.12345', 'Row 1', 'X'),
(null, 'Row 2', 'Y'),
('2021-01-01T05:11:10.432', 'Row 3', 'Y');

statement ok
create table t as
select
arrow_cast(column1, 'Timestamp(Nanosecond, None)') as nanos,
arrow_cast(column1, 'Timestamp(Microsecond, None)') as micros,
arrow_cast(column1, 'Timestamp(Millisecond, None)') as millis,
arrow_cast(column1, 'Timestamp(Second, None)') as secs,
column2 as names
column2 as names,
column3 as tag
from t_source;

# Demonstate the contents
query PPPPT
query PPPPTT
select * from t;
----
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1
NULL NULL NULL NULL Row 2
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 X
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10 Row 1 X
NULL NULL NULL NULL Row 2 Y
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3 Y


# aggregate_timestamps_sum
statement error Error during planning: The function Sum does not support inputs of type Timestamp\(Nanosecond, None\)
statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t;

statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag;

# aggregate_timestamps_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t;
----
3 3 3 3

query TIIII
SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag;
----
X 2 2 2 2
Y 1 1 1 1

# aggregate_timestamps_min
query PPPP
SELECT min(nanos), min(micros), min(millis), min(secs) FROM t;
----
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10

query TPPPP
SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag;
----
X 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123 2011-12-13T11:13:10
Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10

# aggregate_timestamps_max
query PPPP
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t;
----
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10

query TPPPP
SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag
----
X 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10
Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10


# aggregate_timestamps_avg
statement error Error during planning: The function Avg does not support inputs of type Timestamp\(Nanosecond, None\).
statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t

statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag;


statement ok
drop table t_source;

statement ok
drop table t;


# All supported Date tpes

# "date32" --> Date32Array
# "date64" --> Date64Array
# "names" --> StringArray

statement ok
create table t_source
as values
('2018-11-13', 'Row 0', 'X'),
('2011-12-13', 'Row 1', 'X'),
(null, 'Row 2', 'Y'),
('2021-01-01', 'Row 3', 'Y');

statement ok
create table t as
select
arrow_cast(column1, 'Date32') as date32,
-- Workaround https://github.com/apache/arrow-rs/issues/4512 is fixed, can use this
-- arrow_cast(column1, 'Date64') as date64,
arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64,
column2 as names,
column3 as tag
from t_source;

# Demonstate the contents
query DDTT
select * from t;
----
2018-11-13 2018-11-13T00:00:00 Row 0 X
2011-12-13 2011-12-13T00:00:00 Row 1 X
NULL NULL Row 2 Y
2021-01-01 2021-01-01T00:00:00 Row 3 Y


# aggregate_timestamps_sum
statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Date32\.
SELECT sum(date32), sum(date64) FROM t;

statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Date32\.
SELECT tag, sum(date32), sum(date64) FROM t GROUP BY tag ORDER BY tag;

# aggregate_timestamps_count
query II
SELECT count(date32), count(date64) FROM t;
----
3 3

query TII
SELECT tag, count(date32), count(date64) FROM t GROUP BY tag ORDER BY tag;
----
X 2 2
Y 1 1

# aggregate_timestamps_min
query DD
SELECT min(date32), min(date64) FROM t;
----
2011-12-13 2011-12-13T00:00:00

query TDD
SELECT tag, min(date32), min(date64) FROM t GROUP BY tag ORDER BY tag;
----
X 2011-12-13 2011-12-13T00:00:00
Y 2021-01-01 2021-01-01T00:00:00

# aggregate_timestamps_max
query DD
SELECT max(date32), max(date64) FROM t;
----
2021-01-01 2021-01-01T00:00:00

query TDD
SELECT tag, max(date32), max(date64) FROM t GROUP BY tag ORDER BY tag
----
X 2018-11-13 2018-11-13T00:00:00
Y 2021-01-01 2021-01-01T00:00:00


# aggregate_timestamps_avg
statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Date32\.
SELECT avg(date32), avg(date64) FROM t

statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Date32\.
SELECT tag, avg(date32), avg(date64) FROM t GROUP BY tag ORDER BY tag;


statement ok
drop table t_source;

statement ok
drop table t;


# All supported time types

# Columns are named:
Expand All @@ -1636,10 +1755,10 @@ drop table t;
statement ok
create table t_source
as values
('18:06:30.243620451', 'Row 0'),
('20:08:28.161121654', 'Row 1'),
('19:11:04.156423842', 'Row 2'),
('21:06:28.247821084', 'Row 3');
('18:06:30.243620451', 'Row 0', 'A'),
('20:08:28.161121654', 'Row 1', 'A'),
('19:11:04.156423842', 'Row 2', 'B'),
('21:06:28.247821084', 'Row 3', 'B');


statement ok
Expand All @@ -1649,46 +1768,71 @@ select
arrow_cast(column1, 'Time64(Microsecond)') as micros,
arrow_cast(column1, 'Time32(Millisecond)') as millis,
arrow_cast(column1, 'Time32(Second)') as secs,
column2 as names
column2 as names,
column3 as tag
from t_source;

# Demonstate the contents
query DDDDT
query DDDDTT
select * from t;
----
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0
20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1
19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0 A
20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1 A
19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2 B
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3 B

# aggregate_times_sum
statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Time64\(Nanosecond\).
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t

statement error DataFusion error: Error during planning: The function Sum does not support inputs of type Time64\(Nanosecond\)\.
SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY tag ORDER BY tag

# aggregate_times_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t
----
4 4 4 4

query TIIII
SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t GROUP BY tag ORDER BY tag
----
A 2 2 2 2
B 2 2 2 2


# aggregate_times_min
query DDDD
SELECT min(nanos), min(micros), min(millis), min(secs) FROM t
----
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30

query TDDDD
SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY tag ORDER BY tag
----
A 18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30
B 19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04

# aggregate_times_max
query DDDD
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t
----
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28

query TDDDD
SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY tag ORDER BY tag
----
A 20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28
B 21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28


# aggregate_times_avg
statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Time64\(Nanosecond\).
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t

statement error DataFusion error: Error during planning: The function Avg does not support inputs of type Time64\(Nanosecond\)\.
SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY tag ORDER BY tag;

statement ok
drop table t_source;

Expand All @@ -1710,13 +1854,25 @@ select sum(c1), arrow_typeof(sum(c1)) from d_table;
----
100 Decimal128(20, 3)

query TRT
select c2, sum(c1), arrow_typeof(sum(c1)) from d_table GROUP BY c2 ORDER BY c2;
----
A 1100.045 Decimal128(20, 3)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not 100% sure whether the output type is correct, but I also don't know that it isn't so 👍

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

cc @liukun4515 in case you have some thoughts (this PR doesn't change the type FWIW)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The data type of c1 is c1 decimal(10,3)
the result of sum(c1) is decimal(10+10,3), this align with spark rule: the new precision = old precision + 10

B -1000.045 Decimal128(20, 3)


# aggregate_decimal_avg
query RT
select avg(c1), arrow_typeof(avg(c1)) from d_table
----
5 Decimal128(14, 7)

query TRT
select c2, avg(c1), arrow_typeof(avg(c1)) from d_table GROUP BY c2 ORDER BY c2
----
A 110.0045 Decimal128(14, 7)
B -100.0045 Decimal128(14, 7)

# Use PostgresSQL dialect
statement ok
set datafusion.sql_parser.dialect = 'Postgres';
Expand Down