From 8dc4c96075f20240935f298c6e909f76dcb9a8c3 Mon Sep 17 00:00:00 2001
From: Yevhenii Melnyk
Date: Mon, 23 Jan 2023 12:36:23 +0100
Subject: [PATCH] Sqllogictest: use the same normalization for all tests (#5013)

* Sqllogictest: use the same normalization for all tests

* trigger ci
---
 .../src/engines/datafusion/mod.rs           |  24 +---
 .../src/engines/datafusion/normalize.rs     |  72 ++++-------
 .../core/tests/sqllogictests/src/main.rs    |  81 +++++-------
 .../sqllogictests/test_files/aggregate.slt  | 120 +++++++++---------
 .../tests/sqllogictests/test_files/ddl.slt  |   2 +-
 .../tests/sqllogictests/test_files/tpch.slt |  10 +-
 6 files changed, 126 insertions(+), 183 deletions(-)

diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/mod.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/mod.rs
index e1fffd669ccb..4acdcae7639e 100644
--- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/mod.rs
+++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/mod.rs
@@ -34,20 +34,11 @@ mod normalize;
 pub struct DataFusion {
     ctx: SessionContext,
     file_name: String,
-    is_pg_compatibility_test: bool,
 }
 
 impl DataFusion {
-    pub fn new(
-        ctx: SessionContext,
-        file_name: String,
-        postgres_compatible: bool,
-    ) -> Self {
-        Self {
-            ctx,
-            file_name,
-            is_pg_compatibility_test: postgres_compatible,
-        }
+    pub fn new(ctx: SessionContext, file_name: String) -> Self {
+        Self { ctx, file_name }
     }
 }
 
@@ -57,7 +48,7 @@ impl sqllogictest::AsyncDB for DataFusion {
 
     async fn run(&mut self, sql: &str) -> Result<DBOutput> {
         println!("[{}] Running query: \"{}\"", self.file_name, sql);
-        let result = run_query(&self.ctx, sql, self.is_pg_compatibility_test).await?;
+        let result = run_query(&self.ctx, sql).await?;
         Ok(result)
     }
 
@@ -76,11 +67,7 @@ impl sqllogictest::AsyncDB for DataFusion {
     }
 }
 
-async fn run_query(
-    ctx: &SessionContext,
-    sql: impl Into<String>,
-    is_pg_compatibility_test: bool,
-) -> Result<DBOutput> {
+async fn run_query(ctx: &SessionContext, sql: impl Into<String>) -> Result<DBOutput> {
     let sql = sql.into();
     // Check if the sql is `insert`
     if let Ok(mut statements) = DFParser::parse_sql(&sql) {
@@ -94,7 +81,6 @@ async fn run_query(
     }
     let df = ctx.sql(sql.as_str()).await?;
     let results: Vec<RecordBatch> = df.collect().await?;
-    let formatted_batches =
-        normalize::convert_batches(results, is_pg_compatibility_test)?;
+    let formatted_batches = normalize::convert_batches(results)?;
     Ok(formatted_batches)
 }
diff --git a/datafusion/core/tests/sqllogictests/src/engines/datafusion/normalize.rs b/datafusion/core/tests/sqllogictests/src/engines/datafusion/normalize.rs
index 5733b1c32d11..148b4ecf7b88 100644
--- a/datafusion/core/tests/sqllogictests/src/engines/datafusion/normalize.rs
+++ b/datafusion/core/tests/sqllogictests/src/engines/datafusion/normalize.rs
@@ -26,10 +26,7 @@ use super::error::{DFSqlLogicTestError, Result};
 ///
 /// Assumes empty record batches are a successful statement completion
 ///
-pub fn convert_batches(
-    batches: Vec<RecordBatch>,
-    is_pg_compatibility_test: bool,
-) -> Result<DBOutput> {
+pub fn convert_batches(batches: Vec<RecordBatch>) -> Result<DBOutput> {
     if batches.is_empty() {
         // DataFusion doesn't report number of rows complete
         return Ok(DBOutput::StatementComplete(0));
@@ -53,23 +50,20 @@ pub fn convert_batches(
                 ),
             )));
         }
-        rows.append(&mut convert_batch(batch, is_pg_compatibility_test)?);
+        rows.append(&mut convert_batch(batch)?);
     }
 
     Ok(DBOutput::Rows { types, rows })
 }
 
 /// Convert a single batch to a `Vec<Vec<String>>` for comparison
-fn convert_batch(
-    batch: RecordBatch,
-    is_pg_compatibility_test: bool,
-) -> Result<Vec<Vec<String>>> {
+fn convert_batch(batch: RecordBatch) -> Result<Vec<Vec<String>>> {
     (0..batch.num_rows())
         .map(|row| {
             batch
                 .columns()
                 .iter()
-                .map(|col| cell_to_string(col, row, is_pg_compatibility_test))
+                .map(|col| cell_to_string(col, row))
                 .collect::<Result<Vec<String>>>()
         })
         .collect()
 }
@@ -93,18 +87,29 @@ macro_rules! get_row_value {
 ///
 /// Floating numbers are rounded to have a consistent representation with the Postgres runner.
 ///
-pub fn cell_to_string(
-    col: &ArrayRef,
-    row: usize,
-    is_pg_compatibility_test: bool,
-) -> Result<String> {
+pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result<String> {
     if !col.is_valid(row) {
         // represent any null value with the string "NULL"
         Ok(NULL_STR.to_string())
-    } else if is_pg_compatibility_test {
-        postgres_compatible_cell_to_string(col, row)
     } else {
         match col.data_type() {
+            DataType::Boolean => {
+                Ok(bool_to_str(get_row_value!(array::BooleanArray, col, row)))
+            }
+            DataType::Float16 => {
+                Ok(f16_to_str(get_row_value!(array::Float16Array, col, row)))
+            }
+            DataType::Float32 => {
+                Ok(f32_to_str(get_row_value!(array::Float32Array, col, row)))
+            }
+            DataType::Float64 => {
+                Ok(f64_to_str(get_row_value!(array::Float64Array, col, row)))
+            }
+            DataType::Decimal128(_, scale) => {
+                let value = get_row_value!(array::Decimal128Array, col, row);
+                let decimal_scale = u32::try_from((*scale).max(0)).unwrap();
+                Ok(i128_to_str(value, decimal_scale))
+            }
             DataType::LargeUtf8 => Ok(varchar_to_str(get_row_value!(
                 array::LargeStringArray,
                 col,
@@ -118,36 +123,3 @@ pub fn cell_to_string(
                 row
             ))),
             DataType::Utf8 => {
                 Ok(varchar_to_str(get_row_value!(array::StringArray, col, row)))
             }
             _ => arrow::util::display::array_value_to_string(col, row),
         }
         .map_err(DFSqlLogicTestError::Arrow)
     }
 }
-
-/// Convert values to text representation that are the same as in Postgres client implementation.
-fn postgres_compatible_cell_to_string(col: &ArrayRef, row: usize) -> Result<String> {
-    match col.data_type() {
-        DataType::Boolean => {
-            Ok(bool_to_str(get_row_value!(array::BooleanArray, col, row)))
-        }
-        DataType::Float16 => {
-            Ok(f16_to_str(get_row_value!(array::Float16Array, col, row)))
-        }
-        DataType::Float32 => {
-            Ok(f32_to_str(get_row_value!(array::Float32Array, col, row)))
-        }
-        DataType::Float64 => {
-            Ok(f64_to_str(get_row_value!(array::Float64Array, col, row)))
-        }
-        DataType::Decimal128(_, scale) => {
-            let value = get_row_value!(array::Decimal128Array, col, row);
-            let decimal_scale = u32::try_from((*scale).max(0)).unwrap();
-            Ok(i128_to_str(value, decimal_scale))
-        }
-        DataType::LargeUtf8 => Ok(varchar_to_str(get_row_value!(
-            array::LargeStringArray,
-            col,
-            row
-        ))),
-        DataType::Utf8 => {
-            Ok(varchar_to_str(get_row_value!(array::StringArray, col, row)))
-        }
-        _ => arrow::util::display::array_value_to_string(col, row),
-    }
-    .map_err(DFSqlLogicTestError::Arrow)
-}
diff --git a/datafusion/core/tests/sqllogictests/src/main.rs b/datafusion/core/tests/sqllogictests/src/main.rs
index 467ad5717c8f..b15f24c6bc3c 100644
--- a/datafusion/core/tests/sqllogictests/src/main.rs
+++ b/datafusion/core/tests/sqllogictests/src/main.rs
@@ -18,7 +18,7 @@
 use std::error::Error;
 use std::path::{Path, PathBuf};
 
-use log::{debug, info};
+use log::info;
 use testcontainers::clients::Cli as Docker;
 
 use datafusion::prelude::SessionContext;
@@ -56,36 +56,23 @@ pub async fn main() -> Result<(), Box<dyn Error>> {
 
     for path in files {
         let file_name = path.file_name().unwrap().to_str().unwrap().to_string();
-        let is_pg_compatibility_test = file_name.starts_with(PG_COMPAT_FILE_PREFIX);
 
         if options.complete_mode {
-            run_complete_file(&path, file_name, is_pg_compatibility_test).await?;
+            run_complete_file(&path, file_name).await?;
        } else if options.postgres_runner {
-            if is_pg_compatibility_test {
-                run_test_file_with_postgres(&path, file_name).await?;
-            } else {
-                debug!("Skipping test file {:?}", path);
-            }
+            run_test_file_with_postgres(&path, file_name).await?;
         } else {
-            run_test_file(&path, file_name, is_pg_compatibility_test).await?;
+            run_test_file(&path, file_name).await?;
         }
     }
 
     Ok(())
 }
 
-async fn run_test_file(
-    path: &PathBuf,
-    file_name: String,
-    is_pg_compatibility_test: bool,
-) -> Result<(), Box<dyn Error>> {
+async fn run_test_file(path: &PathBuf, file_name: String) -> Result<(), Box<dyn Error>> {
     println!("Running with DataFusion runner: {}", path.display());
-    let ctx = context_for_test_file(&file_name, is_pg_compatibility_test).await;
-    let mut runner = sqllogictest::Runner::new(DataFusion::new(
-        ctx,
-        file_name,
-        is_pg_compatibility_test,
-    ));
+    let ctx = context_for_test_file(&file_name).await;
+    let mut runner = sqllogictest::Runner::new(DataFusion::new(ctx, file_name));
     runner.run_file_async(path).await?;
     Ok(())
 }
@@ -117,18 +104,13 @@ async fn run_test_file_with_postgres(
 async fn run_complete_file(
     path: &PathBuf,
     file_name: String,
-    is_pg_compatibility_test: bool,
 ) -> Result<(), Box<dyn Error>> {
     use sqllogictest::{default_validator, update_test_file};
 
     info!("Using complete mode to complete: {}", path.display());
 
-    let ctx = context_for_test_file(&file_name, is_pg_compatibility_test).await;
-    let mut runner = sqllogictest::Runner::new(DataFusion::new(
-        ctx,
-        file_name,
-        is_pg_compatibility_test,
-    ));
+    let ctx = context_for_test_file(&file_name).await;
+    let mut runner = sqllogictest::Runner::new(DataFusion::new(ctx, file_name));
 
     info!("Using complete mode to complete {}", path.display());
     let col_separator = " ";
@@ -145,31 +127,28 @@ fn read_test_files(options: &Options) -> Vec<PathBuf> {
         .unwrap()
         .map(|path| path.unwrap().path())
         .filter(|path| options.check_test_file(path.as_path()))
+        .filter(|path| options.check_pg_compat_file(path.as_path()))
         .collect()
 }
 
 /// Create a SessionContext, configured for the specific test
-async fn context_for_test_file(
-    file_name: &str,
-    is_pg_compatibility_test: bool,
-) -> SessionContext {
-    if is_pg_compatibility_test {
-        info!("Registering pg compatibility tables");
-        let ctx = SessionContext::new();
-        setup::register_aggregate_csv_by_sql(&ctx).await;
-        ctx
-    } else {
-        match file_name {
-            "aggregate.slt" | "select.slt" => {
-                info!("Registering aggregate tables");
-                let ctx = SessionContext::new();
-                setup::register_aggregate_tables(&ctx).await;
-                ctx
-            }
-            _ => {
-                info!("Using default SessionContext");
-                SessionContext::new()
-            }
+async fn context_for_test_file(file_name: &str) -> SessionContext {
+    match file_name {
+        "aggregate.slt" | "select.slt" => {
+            info!("Registering aggregate tables");
+            let ctx = SessionContext::new();
+            setup::register_aggregate_tables(&ctx).await;
+            ctx
+        }
+        _ if file_name.starts_with(PG_COMPAT_FILE_PREFIX) => {
+            info!("Registering pg compatibility tables");
+            let ctx = SessionContext::new();
+            setup::register_aggregate_csv_by_sql(&ctx).await;
+            ctx
+        }
+        _ => {
+            info!("Using default SessionContext");
+            SessionContext::new()
         }
     }
 }
@@ -233,4 +212,10 @@ impl Options {
         let path_str = path.to_string_lossy();
         self.filters.iter().any(|filter| path_str.contains(filter))
     }
+
+    /// Postgres runner executes only tests in files with specific names
+    fn check_pg_compat_file(&self, path: &Path) -> bool {
+        let file_name = path.file_name().unwrap().to_str().unwrap().to_string();
+        !self.postgres_runner || file_name.starts_with(PG_COMPAT_FILE_PREFIX)
+    }
 }
diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt index 5dccb2427f3e..ea6b359fbacc 100644 --- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt +++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt @@ -22,19 +22,19 @@ query R SELECT avg(c12) FROM aggregate_test_100 ---- -0.5089725099127211 +0.508972509913 # csv_query_covariance_1 query R SELECT covar_pop(c2, c12) FROM aggregate_test_100 ---- --0.07916932235380847 +-0.079169322354 # csv_query_covariance_2 query R SELECT covar(c2, c12) FROM aggregate_test_100 ---- --0.07996901247859442 +-0.079969012479 # single_row_query_covar_1 query R @@ -78,13 +78,13 @@ with data as ( select covar_samp(f, b), covar_pop(f, b) from data ---- -1 0.6666666666666666 +1 0.666666666667 # csv_query_correlation query R SELECT corr(c2, c12) FROM aggregate_test_100 ---- --0.19064544190576607 +-0.190645441906 # single_row_query_correlation query R @@ -140,25 +140,25 @@ SELECT var_pop(c6) FROM aggregate_test_100 query R SELECT var_pop(c12) FROM aggregate_test_100 ---- -0.09234223721582163 +0.092342237216 # csv_query_variance_4 query R SELECT var(c2) FROM aggregate_test_100 ---- -1.8863636363636365 +1.886363636364 # csv_query_variance_5 query R SELECT var_samp(c2) FROM aggregate_test_100 ---- -1.8863636363636365 +1.886363636364 # csv_query_stddev_1 query R SELECT stddev_pop(c2) FROM aggregate_test_100 ---- -1.3665650368716449 +1.366565036872 # csv_query_stddev_2 query R @@ -170,25 +170,25 @@ SELECT stddev_pop(c6) FROM aggregate_test_100 query R SELECT stddev_pop(c12) FROM aggregate_test_100 ---- -0.30387865541334363 +0.303878655413 # csv_query_stddev_4 query R SELECT stddev(c12) FROM aggregate_test_100 ---- -0.3054095399405338 +0.305409539941 # csv_query_stddev_5 query R SELECT stddev_samp(c12) FROM aggregate_test_100 ---- -0.3054095399405338 +0.305409539941 # csv_query_stddev_6 query R select stddev(sq.column1) from (values (1.1), (2.0), (3.0)) as sq ---- -0.9504384952922168 +0.950438495292 # csv_query_approx_median_1 query I @@ -206,7 +206,7 @@ SELECT approx_median(c6) FROM aggregate_test_100 query R SELECT approx_median(c12) FROM aggregate_test_100 ---- -0.5550065410522981 +0.555006541052 # csv_query_median_1 query I @@ -224,7 +224,7 @@ SELECT median(c6) FROM aggregate_test_100 query R SELECT median(c12) FROM aggregate_test_100 ---- -0.5513900544385053 +0.551390054439 # median_i8 query I @@ -606,40 +606,40 @@ query TIR SELECT c1, c2, AVG(c3) FROM aggregate_test_100_by_sql GROUP BY CUBE (c1, c2) ORDER BY c1, c2 ---- a 1 -17.6 -a 2 -15.333333333333334 +a 2 -15.333333333333 a 3 -4.5 a 4 -32 a 5 -32 -a NULL -18.333333333333332 -b 1 31.666666666666668 +a NULL -18.333333333333 +b 1 31.666666666667 b 2 25.5 b 3 -42 b 4 -44.6 b 5 -0.2 -b NULL -5.842105263157895 +b NULL -5.842105263158 c 1 47.5 -c 2 -55.57142857142857 +c 2 -55.571428571429 c 3 47.5 c 4 -10.75 c 5 12 -c NULL -1.3333333333333333 -d 1 -8.142857142857142 -d 2 109.33333333333333 -d 3 41.333333333333336 +c NULL -1.333333333333 +d 1 -8.142857142857 +d 2 109.333333333333 +d 3 41.333333333333 d 4 54 d 5 -49.5 -d NULL 25.444444444444443 -e 1 75.66666666666667 +d NULL 25.444444444444 +e 1 75.666666666667 e 2 37.8 e 3 48 -e 4 37.285714285714285 +e 4 37.285714285714 e 5 -11 -e NULL 40.333333333333336 -NULL 1 16.681818181818183 -NULL 2 8.363636363636363 -NULL 3 20.789473684210527 -NULL 4 1.2608695652173914 -NULL 5 -13.857142857142858 +e NULL 40.333333333333 +NULL 1 16.681818181818 +NULL 2 8.363636363636 +NULL 3 20.789473684211 +NULL 4 1.260869565217 +NULL 5 -13.857142857143 NULL NULL 
7.81 # csv_query_rollup_avg @@ -671,7 +671,7 @@ a 5 -101 -12484 a 5 -31 -12907 a 5 36 -16974 a 5 NULL -14121.666666666666 -a NULL NULL 306.04761904761904 +a NULL NULL 306.047619047619 b 1 12 7652 b 1 29 -18218 b 1 54 -18410 @@ -950,11 +950,11 @@ SELECT array_agg(c13) FROM (SELECT * FROM aggregate_test_100 ORDER BY c13 LIMIT query IIRIII select c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by c2 order by c2 ---- -1 367 16.681818181818183 125 -99 22 -2 184 8.363636363636363 122 -117 22 -3 395 20.789473684210527 123 -101 19 -4 29 1.2608695652173914 123 -117 23 -5 -194 -13.857142857142858 118 -101 14 +1 367 16.681818181818 125 -99 22 +2 184 8.363636363636 122 -117 22 +3 395 20.789473684211 123 -101 19 +4 29 1.260869565217 123 -117 23 +5 -194 -13.857142857143 118 -101 14 # TODO: csv_query_array_agg_unsupported # statement error @@ -965,40 +965,40 @@ query TIIRIII select c1, c2, sum(c3) sum_c3, avg(c3) avg_c3, max(c3) max_c3, min(c3) min_c3, count(c3) count_c3 from aggregate_test_100 group by CUBE (c1,c2) order by c1, c2 ---- a 1 -88 -17.6 83 -85 5 -a 2 -46 -15.333333333333334 45 -48 3 +a 2 -46 -15.333333333333 45 -48 3 a 3 -27 -4.5 17 -72 6 a 4 -128 -32 65 -101 4 a 5 -96 -32 36 -101 3 -a NULL -385 -18.333333333333332 83 -101 21 -b 1 95 31.666666666666668 54 12 3 +a NULL -385 -18.333333333333 83 -101 21 +b 1 95 31.666666666667 54 12 3 b 2 102 25.5 68 -60 4 b 3 -84 -42 17 -101 2 b 4 -223 -44.6 47 -117 5 b 5 -1 -0.2 68 -82 5 -b NULL -111 -5.842105263157895 68 -117 19 +b NULL -111 -5.842105263158 68 -117 19 c 1 190 47.5 103 -24 4 -c 2 -389 -55.57142857142857 29 -117 7 +c 2 -389 -55.571428571429 29 -117 7 c 3 190 47.5 97 -2 4 c 4 -43 -10.75 123 -90 4 c 5 24 12 118 -94 2 -c NULL -28 -1.3333333333333333 123 -117 21 -d 1 -57 -8.142857142857142 125 -99 7 -d 2 328 109.33333333333333 122 93 3 -d 3 124 41.333333333333336 123 -76 3 +c NULL -28 -1.333333333333 123 -117 21 +d 1 -57 -8.142857142857 125 -99 7 +d 2 328 109.333333333333 122 93 3 +d 3 124 41.333333333333 123 -76 3 d 4 162 54 102 5 3 d 5 -99 -49.5 -40 -59 2 -d NULL 458 25.444444444444443 125 -99 18 -e 1 227 75.66666666666667 120 36 3 +d NULL 458 25.444444444444 125 -99 18 +e 1 227 75.666666666667 120 36 3 e 2 189 37.8 97 -61 5 e 3 192 48 112 -95 4 -e 4 261 37.285714285714285 97 -56 7 +e 4 261 37.285714285714 97 -56 7 e 5 -22 -11 64 -86 2 -e NULL 847 40.333333333333336 120 -95 21 -NULL 1 367 16.681818181818183 125 -99 22 -NULL 2 184 8.363636363636363 122 -117 22 -NULL 3 395 20.789473684210527 123 -101 19 -NULL 4 29 1.2608695652173914 123 -117 23 -NULL 5 -194 -13.857142857142858 118 -101 14 +e NULL 847 40.333333333333 120 -95 21 +NULL 1 367 16.681818181818 125 -99 22 +NULL 2 184 8.363636363636 122 -117 22 +NULL 3 395 20.789473684211 123 -101 19 +NULL 4 29 1.260869565217 123 -117 23 +NULL 5 -194 -13.857142857143 118 -101 14 NULL NULL 781 7.81 125 -117 100 # csv_query_array_agg_distinct @@ -1060,14 +1060,14 @@ select max(c1) from d_table query R select sum(c1) from d_table ---- -100.000 +100 # FIX: doesn't check datatype # aggregate_decimal_avg query R select avg(c1) from d_table ---- -5.0000000 +5 # FIX: different test table # aggregate @@ -1182,7 +1182,7 @@ NULL 0 NULL 0 query RRRR select var(sq.column1), var_pop(sq.column1), stddev(sq.column1), stddev_pop(sq.column1) from (values (1.0), (3.0)) as sq; ---- -2 1 1.4142135623730951 1 +2 1 1.414213562373 1 # sum / count for all nulls diff --git a/datafusion/core/tests/sqllogictests/test_files/ddl.slt 
b/datafusion/core/tests/sqllogictests/test_files/ddl.slt index bdce635a4364..cb7db95e9c81 100644 --- a/datafusion/core/tests/sqllogictests/test_files/ddl.slt +++ b/datafusion/core/tests/sqllogictests/test_files/ddl.slt @@ -329,7 +329,7 @@ LOCATION '../../testing/data/csv/aggregate_test_100.csv'; query IIIIIIIIIIIII SELECT c1, c2, c3, c4, c5, c6, c7, c8, c9, 10, c11, c12, c13 FROM aggregate_test_100 LIMIT 1; ---- -c 2 1 18109 2033001162 -6513304855495910254 25 43062 1491205016 10 0.110830784 0.9294097332465232 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW +c 2 1 18109 2033001162 -6513304855495910254 25 43062 1491205016 10 0.110830784 0.929409733247 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW diff --git a/datafusion/core/tests/sqllogictests/test_files/tpch.slt b/datafusion/core/tests/sqllogictests/test_files/tpch.slt index 2c11983d59e5..313745113201 100644 --- a/datafusion/core/tests/sqllogictests/test_files/tpch.slt +++ b/datafusion/core/tests/sqllogictests/test_files/tpch.slt @@ -141,9 +141,9 @@ order by l_returnflag, l_linestatus; ---- -A F 27.0000000000 39890.8800000000 37497.4272000000 40122.2471040000 27.00000000000000 39890.88000000000000 0.06000000000000 1 -N O 166.0000000000 205387.5000000000 191556.1888000000 199866.7593360000 27.66666666666666 34231.25000000000000 0.07500000000000 6 -R F 94.0000000000 100854.5200000000 92931.3900000000 92931.3900000000 47.00000000000000 50427.26000000000000 0.08000000000000 2 +A F 27 39890.88 37497.4272 40122.247104 27 39890.88 0.06 1 +N O 166 205387.5 191556.1888 199866.759336 27.666666666667 34231.25 0.075 6 +R F 94 100854.52 92931.39 92931.39 47 50427.26 0.08 2 # q2 @@ -863,5 +863,5 @@ group by order by cntrycode; ---- -18 1 8324.0700000000 -30 1 7638.5700000000 +18 1 8324.07 +30 1 7638.57
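
Note on the expected-value churn in the .slt files above: every query now goes through the same cell formatting used for the pg_compat files (`bool_to_str`, `f64_to_str`, `i128_to_str`, `varchar_to_str`, ...), instead of Arrow's default display. The sketch below only illustrates the float behaviour the updated expectations imply — render with at most 12 decimal places, then trim trailing zeros; `normalize_f64` is a hypothetical stand-in, not the actual DataFusion helper:

```rust
/// Hypothetical stand-in for the runner's f64 formatting; the real logic
/// lives in the sqllogictest engine helpers referenced by normalize.rs.
fn normalize_f64(value: f64) -> String {
    // Format with 12 decimal places, then drop trailing zeros and a bare dot.
    let rounded = format!("{value:.12}");
    rounded
        .trim_end_matches('0')
        .trim_end_matches('.')
        .to_string()
}

fn main() {
    // avg(c12) in aggregate.slt: 0.5089725099127211 -> 0.508972509913
    assert_eq!(normalize_f64(0.5089725099127211), "0.508972509913");
    // covar_pop(c2, c12): -0.07916932235380847 -> -0.079169322354
    assert_eq!(normalize_f64(-0.07916932235380847), "-0.079169322354");
    // values that are already short pass through unchanged (e.g. -17.6)
    assert_eq!(normalize_f64(-17.6), "-17.6");
}
```

Decimal results are normalized analogously via `i128_to_str` with the column's declared scale, which is why `100.000` becomes `100` in aggregate.slt and `27.0000000000` becomes `27` in tpch.slt.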