diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs
index 4b6627234334..a841fe532294 100644
--- a/benchmarks/src/tpch/convert.rs
+++ b/benchmarks/src/tpch/convert.rs
@@ -86,10 +86,11 @@ impl ConvertOpt {
         // Select all apart from the padding column
         let selection = csv
             .schema()
-            .fields()
             .iter()
             .take(schema.fields.len() - 1)
-            .map(|d| Expr::Column(d.qualified_column()))
+            .map(|(qualifier, field)| {
+                Expr::Column(Column::from((qualifier, field.as_ref())))
+            })
             .collect();

         csv = csv.select(selection)?;
diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 5f9f3106e14d..6e9c42480c32 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -22,7 +22,7 @@ use arrow::array::{BooleanArray, Int32Array};
 use arrow::record_batch::RecordBatch;
 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
-use datafusion::common::{DFField, DFSchema};
+use datafusion::common::DFSchema;
 use datafusion::error::Result;
 use datafusion::optimizer::simplify_expressions::ExprSimplifier;
 use datafusion::physical_expr::{
@@ -272,32 +272,30 @@ fn expression_type_demo() -> Result<()> {
     // types of the input expressions. You can provide this information using
     // a schema. In this case we create a schema where the column `c` is of
     // type Utf8 (a String / VARCHAR)
-    let schema = DFSchema::new_with_metadata(
-        vec![DFField::new_unqualified("c", DataType::Utf8, true)],
+    let schema = DFSchema::from_unqualified_fields(
+        vec![Field::new("c", DataType::Utf8, true)].into(),
         HashMap::new(),
-    )
-    .unwrap();
+    )?;
     assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap()));

     // Using a schema where the column `foo` is of type Int32
-    let schema = DFSchema::new_with_metadata(
-        vec![DFField::new_unqualified("c", DataType::Int32, true)],
+    let schema = DFSchema::from_unqualified_fields(
+        vec![Field::new("c", DataType::Int32, true)].into(),
         HashMap::new(),
-    )
-    .unwrap();
+    )?;
     assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap()));

     // Get the type of an expression that adds 2 columns. Adding an Int32
     // and Float32 results in Float32 type
     let expr = col("c1") + col("c2");
-    let schema = DFSchema::new_with_metadata(
+    let schema = DFSchema::from_unqualified_fields(
         vec![
-            DFField::new_unqualified("c1", DataType::Int32, true),
-            DFField::new_unqualified("c2", DataType::Float32, true),
-        ],
+            Field::new("c1", DataType::Int32, true),
+            Field::new("c2", DataType::Float32, true),
+        ]
+        .into(),
         HashMap::new(),
-    )
-    .unwrap();
+    )?;
     assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap()));

     Ok(())
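For orientation, here is a minimal standalone sketch of the pattern the updated example relies on — building an unqualified `DFSchema` directly from Arrow `Field`s and asking an expression for its type. It is not part of the diff and assumes the `datafusion` crate at this revision:

```rust
use std::collections::HashMap;

use arrow::datatypes::{DataType, Field};
use datafusion::common::DFSchema;
use datafusion::error::Result;
use datafusion::logical_expr::ExprSchemable;
use datafusion::prelude::col;

fn main() -> Result<()> {
    // Build an unqualified DFSchema directly from Arrow fields (no DFField wrapper)
    let schema = DFSchema::from_unqualified_fields(
        vec![Field::new("c", DataType::Utf8, true)].into(),
        HashMap::new(),
    )?;
    // `get_type` resolves the expression's type against the schema
    let expr = col("c");
    assert_eq!("Utf8", format!("{}", expr.get_type(&schema)?));
    Ok(())
}
```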
diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index f0edc7175948..16f9579c668c 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -17,6 +17,8 @@

 //! Column

+use arrow_schema::Field;
+
 use crate::error::_schema_err;
 use crate::utils::{parse_identifiers_normalized, quote_identifier};
 use crate::{DFSchema, DataFusionError, OwnedTableReference, Result, SchemaError};
@@ -178,11 +180,12 @@ impl Column {
         }

         for schema in schemas {
-            let fields = schema.fields_with_unqualified_name(&self.name);
-            match fields.len() {
+            let qualified_fields =
+                schema.qualified_fields_with_unqualified_name(&self.name);
+            match qualified_fields.len() {
                 0 => continue,
                 1 => {
-                    return Ok(fields[0].qualified_column());
+                    return Ok(Column::from(qualified_fields[0]));
                 }
                 _ => {
                     // More than 1 fields in this schema have their names set to self.name.
@@ -198,14 +201,13 @@ impl Column {
                     // We will use the relation from the first matched field to normalize self.

                     // Compare matched fields with one USING JOIN clause at a time
+                    let columns = schema.columns_with_unqualified_name(&self.name);
                     for using_col in using_columns {
-                        let all_matched = fields
-                            .iter()
-                            .all(|f| using_col.contains(&f.qualified_column()));
+                        let all_matched = columns.iter().all(|f| using_col.contains(f));
                         // All matched fields belong to the same using column set, in other words
                         // the same join clause. We simply pick the qualifier from the first match.
                         if all_matched {
-                            return Ok(fields[0].qualified_column());
+                            return Ok(columns[0].clone());
                         }
                     }
                 }
@@ -214,10 +216,7 @@ impl Column {

         _schema_err!(SchemaError::FieldNotFound {
             field: Box::new(Column::new(self.relation.clone(), self.name)),
-            valid_fields: schemas
-                .iter()
-                .flat_map(|s| s.fields().iter().map(|f| f.qualified_column()))
-                .collect(),
+            valid_fields: schemas.iter().flat_map(|s| s.columns()).collect(),
         })
     }

@@ -267,13 +266,13 @@ impl Column {
         }

         for schema_level in schemas {
-            let fields = schema_level
+            let qualified_fields = schema_level
                 .iter()
-                .flat_map(|s| s.fields_with_unqualified_name(&self.name))
+                .flat_map(|s| s.qualified_fields_with_unqualified_name(&self.name))
                 .collect::<Vec<_>>();
-            match fields.len() {
+            match qualified_fields.len() {
                 0 => continue,
-                1 => return Ok(fields[0].qualified_column()),
+                1 => return Ok(Column::from(qualified_fields[0])),
                 _ => {
                     // More than 1 fields in this schema have their names set to self.name.
                     //
@@ -288,14 +287,16 @@ impl Column {
                     // We will use the relation from the first matched field to normalize self.

                     // Compare matched fields with one USING JOIN clause at a time
+                    let columns = schema_level
+                        .iter()
+                        .flat_map(|s| s.columns_with_unqualified_name(&self.name))
+                        .collect::<Vec<_>>();
                     for using_col in using_columns {
-                        let all_matched = fields
-                            .iter()
-                            .all(|f| using_col.contains(&f.qualified_column()));
+                        let all_matched = columns.iter().all(|c| using_col.contains(c));
                         // All matched fields belong to the same using column set, in other words
                         // the same join clause. We simply pick the qualifier from the first match.
                        if all_matched {
-                            return Ok(fields[0].qualified_column());
+                            return Ok(columns[0].clone());
                         }
                     }
@@ -312,7 +313,7 @@ impl Column {
             valid_fields: schemas
                 .iter()
                 .flat_map(|s| s.iter())
-                .flat_map(|s| s.fields().iter().map(|f| f.qualified_column()))
+                .flat_map(|s| s.columns())
                 .collect(),
         })
     }
@@ -338,6 +339,13 @@ impl From<String> for Column {
     }
 }

+/// Create a `Column` from an optional qualifier and an Arrow `Field`
+impl From<(Option<&OwnedTableReference>, &Field)> for Column {
+    fn from((relation, field): (Option<&OwnedTableReference>, &Field)) -> Self {
+        Self::new(relation.cloned(), field.name())
+    }
+}
+
 impl FromStr for Column {
     type Err = Infallible;

@@ -355,36 +363,25 @@ impl fmt::Display for Column {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::DFField;
     use arrow::datatypes::DataType;
-    use std::collections::HashMap;
-
-    fn create_schema(names: &[(Option<&str>, &str)]) -> Result<DFSchema> {
-        let fields = names
-            .iter()
-            .map(|(qualifier, name)| {
-                DFField::new(
-                    qualifier.to_owned().map(|s| s.to_string()),
-                    name,
-                    DataType::Boolean,
-                    true,
-                )
-            })
-            .collect::<Vec<_>>();
-        DFSchema::new_with_metadata(fields, HashMap::new())
+    use arrow_schema::{Field, SchemaBuilder};
+
+    fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result<DFSchema> {
+        let mut schema_builder = SchemaBuilder::new();
+        schema_builder.extend(
+            names
+                .iter()
+                .map(|f| Field::new(*f, DataType::Boolean, true)),
+        );
+        let schema = Arc::new(schema_builder.finish());
+        DFSchema::try_from_qualified_schema(qualifier, &schema)
     }

     #[test]
     fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> {
-        let schema1 = create_schema(&[(Some("t1"), "a"), (Some("t1"), "b")])?;
-        let schema2 = create_schema(&[(Some("t2"), "c"), (Some("t2"), "d")])?;
-        let schema3 = create_schema(&[
-            (Some("t3"), "a"),
-            (Some("t3"), "b"),
-            (Some("t3"), "c"),
-            (Some("t3"), "d"),
-            (Some("t3"), "e"),
-        ])?;
+        let schema1 = create_qualified_schema("t1", vec!["a", "b"])?;
+        let schema2 = create_qualified_schema("t2", vec!["c", "d"])?;
+        let schema3 = create_qualified_schema("t3", vec!["a", "b", "c", "d", "e"])?;

         // already normalized
         let col = Column::new(Some("t1"), "a");
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 968d8215ca4d..30ab9a339b54 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -1362,6 +1362,8 @@ impl TableOptions {
     }
 }

+/// Options that control how Parquet files are read, including global options
+/// that apply to all columns and optional column-specific overrides
 #[derive(Clone, Default, Debug, PartialEq)]
 pub struct TableParquetOptions {
     /// Global Parquet options that propagate to all columns.
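The `From` impl added to column.rs above replaces the removed `DFField::qualified_column`. A minimal sketch of how a caller pairs a qualifier with a plain Arrow `Field` (not part of the diff, assuming the `datafusion-common` crate at this revision):

```rust
use arrow::datatypes::{DataType, Field};
use datafusion_common::{Column, OwnedTableReference};

fn main() {
    let field = Field::new("c0", DataType::Boolean, true);
    let qualifier = OwnedTableReference::bare("t1");
    // The new From impl builds a qualified Column from (qualifier, field)
    let col = Column::from((Some(&qualifier), &field));
    assert_eq!(col.flat_name(), "t1.c0");
}
```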
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 90fb0b035d35..f098f98a744c 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -24,16 +24,15 @@ use std::fmt::{Display, Formatter};
 use std::hash::Hash;
 use std::sync::Arc;

-use crate::error::{
-    unqualified_field_not_found, DataFusionError, Result, SchemaError, _plan_err,
-    _schema_err,
-};
+use crate::error::{DataFusionError, Result, _plan_err, _schema_err};
 use crate::{
-    field_not_found, Column, FunctionalDependencies, OwnedTableReference, TableReference,
+    field_not_found, unqualified_field_not_found, Column, FunctionalDependencies,
+    OwnedTableReference, SchemaError, TableReference,
 };

 use arrow::compute::can_cast_types;
 use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef};
+use arrow_schema::SchemaBuilder;

 /// A reference-counted reference to a [DFSchema].
 pub type DFSchemaRef = Arc<DFSchema>;
@@ -95,22 +94,24 @@ pub type DFSchemaRef = Arc<DFSchema>;
 /// Use the `Into` trait to convert `DFSchema` into an Arrow schema:
 ///
 /// ```rust
-/// use datafusion_common::{DFSchema, DFField};
+/// use datafusion_common::DFSchema;
 /// use arrow_schema::Schema;
+/// use arrow::datatypes::Field;
 /// use std::collections::HashMap;
 ///
-/// let df_schema = DFSchema::new_with_metadata(vec![
-///    DFField::new_unqualified("c1", arrow::datatypes::DataType::Int32, false),
-/// ], HashMap::new()).unwrap();
+/// let df_schema = DFSchema::from_unqualified_fields(vec![
+///    Field::new("c1", arrow::datatypes::DataType::Int32, false),
+/// ].into(), HashMap::new()).unwrap();
 /// let schema = Schema::from(df_schema);
 /// assert_eq!(schema.fields().len(), 1);
 /// ```
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct DFSchema {
-    /// Fields
-    fields: Vec<DFField>,
-    /// Additional metadata in form of key value pairs
-    metadata: HashMap<String, String>,
+    /// Inner Arrow schema reference.
+    inner: SchemaRef,
+    /// Optional qualifiers for each column in this schema, in the same order
+    /// as `self.inner.fields()`
+    field_qualifiers: Vec<Option<OwnedTableReference>>,
     /// Stores functional dependencies in the schema.
     functional_dependencies: FunctionalDependencies,
 }
@@ -119,66 +120,107 @@ impl DFSchema {
     /// Creates an empty `DFSchema`
     pub fn empty() -> Self {
         Self {
-            fields: vec![],
-            metadata: HashMap::new(),
+            inner: Arc::new(Schema::new([])),
+            field_qualifiers: vec![],
             functional_dependencies: FunctionalDependencies::empty(),
         }
     }

-    /// Create a new `DFSchema`
+    /// Create a new `DFSchema` from a list of fields, each with an optional qualifier
     pub fn new_with_metadata(
-        fields: Vec<DFField>,
+        qualified_fields: Vec<(Option<OwnedTableReference>, Arc<Field>)>,
         metadata: HashMap<String, String>,
     ) -> Result<Self> {
+        let (qualifiers, fields): (Vec<Option<OwnedTableReference>>, Vec<Arc<Field>>) =
+            qualified_fields.into_iter().unzip();
+
+        let schema = Arc::new(Schema::new_with_metadata(fields, metadata));
+
+        let dfschema = Self {
+            inner: schema,
+            field_qualifiers: qualifiers,
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        dfschema.check_names()?;
+        Ok(dfschema)
+    }
+
+    /// Create a new `DFSchema` from a list of Arrow [Field]s
+    pub fn from_unqualified_fields(
+        fields: Fields,
+        metadata: HashMap<String, String>,
+    ) -> Result<Self> {
+        let field_count = fields.len();
+        let schema = Arc::new(Schema::new_with_metadata(fields, metadata));
+        let dfschema = Self {
+            inner: schema,
+            field_qualifiers: vec![None; field_count],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        dfschema.check_names()?;
+        Ok(dfschema)
+    }
+
+    /// Create a `DFSchema` from an Arrow schema and a given qualifier
+    ///
+    /// To create a schema from an Arrow schema without a qualifier, use
+    /// `DFSchema::try_from`.
+    pub fn try_from_qualified_schema<'a>(
+        qualifier: impl Into<TableReference<'a>>,
+        schema: &Schema,
+    ) -> Result<Self> {
+        let qualifier = qualifier.into();
+        let owned_qualifier = qualifier.to_owned_reference();
+        let schema = DFSchema {
+            inner: schema.clone().into(),
+            field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        schema.check_names()?;
+        Ok(schema)
+    }
+
+    /// Create a `DFSchema` from an Arrow schema where each field can have its own qualifier
+    pub fn from_field_specific_qualified_schema<'a>(
+        qualifiers: Vec<Option<impl Into<TableReference<'a>>>>,
+        schema: &SchemaRef,
+    ) -> Result<Self> {
+        let owned_qualifiers = qualifiers
+            .into_iter()
+            .map(|qualifier| qualifier.map(|q| q.into().to_owned_reference()))
+            .collect();
+        let dfschema = Self {
+            inner: schema.clone(),
+            field_qualifiers: owned_qualifiers,
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        dfschema.check_names()?;
+        Ok(dfschema)
+    }
+
+    /// Check whether the schema has any fields with the same name
+    pub fn check_names(&self) -> Result<()> {
         let mut qualified_names = BTreeSet::new();
         let mut unqualified_names = BTreeSet::new();

-        for field in &fields {
-            if let Some(qualifier) = field.qualifier() {
+        for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) {
+            if let Some(qualifier) = qualifier {
                 qualified_names.insert((qualifier, field.name()));
             } else if !unqualified_names.insert(field.name()) {
                 return _schema_err!(SchemaError::DuplicateUnqualifiedField {
-                    name: field.name().to_string(),
+                    name: field.name().to_string()
                 });
             }
         }

-        // Check for mix of qualified and unqualified fields with same unqualified name.
-        // The BTreeSet storage makes sure that errors are reported in deterministic order.
-        for (qualifier, name) in &qualified_names {
+        for (qualifier, name) in qualified_names {
             if unqualified_names.contains(name) {
                 return _schema_err!(SchemaError::AmbiguousReference {
-                    field: Column {
-                        relation: Some((*qualifier).clone()),
-                        name: name.to_string(),
-                    }
+                    field: Column::new(Some(qualifier.to_owned_reference()), name)
                 });
             }
         }
-        Ok(Self {
-            fields,
-            metadata,
-            functional_dependencies: FunctionalDependencies::empty(),
-        })
-    }
-
-    /// Create a `DFSchema` from an Arrow schema and a given qualifier
-    ///
-    /// To create a schema from an Arrow schema without a qualifier, use
-    /// `DFSchema::try_from`.
-    pub fn try_from_qualified_schema<'a>(
-        qualifier: impl Into<TableReference<'a>>,
-        schema: &Schema,
-    ) -> Result<Self> {
-        let qualifier = qualifier.into();
-        Self::new_with_metadata(
-            schema
-                .fields()
-                .iter()
-                .map(|f| DFField::from_qualified(qualifier.clone(), f.clone()))
-                .collect(),
-            schema.metadata().clone(),
-        )
+        Ok(())
     }

     /// Assigns functional dependencies.
@@ -186,7 +228,7 @@ impl DFSchema {
         mut self,
         functional_dependencies: FunctionalDependencies,
     ) -> Result<Self> {
-        if functional_dependencies.is_valid(self.fields.len()) {
+        if functional_dependencies.is_valid(self.inner.fields.len()) {
             self.functional_dependencies = functional_dependencies;
             Ok(self)
         } else {
@@ -200,50 +242,82 @@ impl DFSchema {
     /// Create a new schema that contains the fields from this schema followed by the fields
     /// from the supplied schema. An error will be returned if there are duplicate field names.
     pub fn join(&self, schema: &DFSchema) -> Result<Self> {
-        let mut fields = self.fields.clone();
-        let mut metadata = self.metadata.clone();
-        fields.extend_from_slice(schema.fields().as_slice());
-        metadata.extend(schema.metadata.clone());
-        Self::new_with_metadata(fields, metadata)
+        let mut schema_builder = SchemaBuilder::new();
+        schema_builder.extend(self.inner.fields().iter().cloned());
+        schema_builder.extend(schema.fields().iter().cloned());
+        let new_schema = schema_builder.finish();
+
+        let mut new_metadata = self.inner.metadata.clone();
+        new_metadata.extend(schema.inner.metadata.clone());
+        let new_schema_with_metadata = new_schema.with_metadata(new_metadata);
+
+        let mut new_qualifiers = self.field_qualifiers.clone();
+        new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice());
+
+        let new_self = Self {
+            inner: Arc::new(new_schema_with_metadata),
+            field_qualifiers: new_qualifiers,
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        new_self.check_names()?;
+        Ok(new_self)
     }
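A minimal sketch of how the reworked constructors and `join` fit together (not part of the diff, assuming the `datafusion-common` crate at this revision):

```rust
use std::collections::HashMap;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};

fn main() -> Result<()> {
    // Qualified schema: the field belongs to table `t1`
    let t1 = DFSchema::try_from_qualified_schema(
        "t1",
        &Schema::new(vec![Field::new("a", DataType::Int32, false)]),
    )?;
    // Unqualified schema built directly from Arrow fields
    let plain = DFSchema::from_unqualified_fields(
        vec![Field::new("b", DataType::Utf8, true)].into(),
        HashMap::new(),
    )?;
    // `join` concatenates the fields of both schemas, keeping each qualifier
    let joined = t1.join(&plain)?;
    assert_eq!(joined.field_names(), ["t1.a", "b"]);
    Ok(())
}
```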
    /// Modify this schema by appending the fields from the supplied schema, ignoring any
    /// duplicate fields.
     pub fn merge(&mut self, other_schema: &DFSchema) {
-        if other_schema.fields.is_empty() {
+        if other_schema.inner.fields.is_empty() {
             return;
         }

-        let self_fields: HashSet<&DFField> = self.fields.iter().collect();
-        let self_unqualified_names: HashSet<&str> =
-            self.fields.iter().map(|x| x.name().as_str()).collect();
-
-        let mut fields_to_add = vec![];
+        let self_fields: HashSet<(Option<&OwnedTableReference>, &FieldRef)> =
+            self.iter().collect();
+        let self_unqualified_names: HashSet<&str> = self
+            .inner
+            .fields
+            .iter()
+            .map(|field| field.name().as_str())
+            .collect();

-        for field in other_schema.fields() {
+        let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone());
+        let mut qualifiers = Vec::new();
+        for (qualifier, field) in other_schema.iter() {
             // skip duplicate columns
-            let duplicated_field = match field.qualifier() {
-                Some(_) => self_fields.contains(field),
+            let duplicated_field = match qualifier {
+                Some(q) => self_fields.contains(&(Some(q), field)),
                 // for unqualified columns, check as unqualified name
                 None => self_unqualified_names.contains(field.name().as_str()),
             };
             if !duplicated_field {
-                fields_to_add.push(field.clone());
+                schema_builder.push(field.clone());
+                qualifiers.push(qualifier.cloned());
             }
         }
-        self.fields.extend(fields_to_add);
-        self.metadata.extend(other_schema.metadata.clone())
+        let mut metadata = self.inner.metadata.clone();
+        metadata.extend(other_schema.inner.metadata.clone());
+
+        let finished = schema_builder.finish();
+        let finished_with_metadata = finished.with_metadata(metadata);
+        self.inner = finished_with_metadata.into();
+        self.field_qualifiers.extend(qualifiers);
     }

     /// Get a list of fields
-    pub fn fields(&self) -> &Vec<DFField> {
-        &self.fields
+    pub fn fields(&self) -> &Fields {
+        &self.inner.fields
     }

     /// Returns an immutable reference of a specific `Field` instance selected using an
     /// offset within the internal `fields` vector
-    pub fn field(&self, i: usize) -> &DFField {
-        &self.fields[i]
+    pub fn field(&self, i: usize) -> &Field {
+        &self.inner.fields[i]
+    }
+
+    /// Returns an immutable reference of a specific `Field` instance selected using an
+    /// offset within the internal `fields` vector and its qualifier
+    pub fn qualified_field(&self, i: usize) -> (Option<&OwnedTableReference>, &Field) {
+        (self.field_qualifiers[i].as_ref(), self.field(i))
     }

     pub fn index_of_column_by_name(
@@ -252,21 +326,18 @@ impl DFSchema {
         &self,
         name: &str,
     ) -> Result<Option<usize>> {
         let mut matches = self
-            .fields
             .iter()
             .enumerate()
-            .filter(|(_, field)| match (qualifier, &field.qualifier) {
+            .filter(|(_, (q, f))| match (qualifier, q) {
                 // field to lookup is qualified.
                 // current field is qualified and not shared between relations, compare both
                 // qualifier and name.
-                (Some(q), Some(field_q)) => {
-                    q.resolved_eq(field_q) && field.name() == name
-                }
+                (Some(q), Some(field_q)) => q.resolved_eq(field_q) && f.name() == name,
                 // field to lookup is qualified but current field is unqualified.
                (Some(qq), None) => {
                     // the original field may now be aliased with a name that matches the
                     // original qualified name
-                    let column = Column::from_qualified_name(field.name());
+                    let column = Column::from_qualified_name(f.name());
                     match column {
                         Column {
                             relation: Some(r),
@@ -276,7 +347,7 @@ impl DFSchema {
                     }
                 }
                 // field to lookup is unqualified, no need to compare qualifier
-                (None, Some(_)) | (None, None) => field.name() == name,
+                (None, Some(_)) | (None, None) => f.name() == name,
             })
             .map(|(idx, _)| idx);
         Ok(matches.next())
@@ -299,7 +370,7 @@ impl DFSchema {
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<&DFField> {
+    ) -> Result<&Field> {
         if let Some(qualifier) = qualifier {
             self.field_with_qualified_name(qualifier, name)
         } else {
@@ -307,11 +378,29 @@ impl DFSchema {
         }
     }

+    /// Find the qualified field with the given name
+    pub fn qualified_field_with_name(
+        &self,
+        qualifier: Option<&TableReference>,
+        name: &str,
+    ) -> Result<(Option<&OwnedTableReference>, &Field)> {
+        if let Some(qualifier) = qualifier {
+            let idx = self
+                .index_of_column_by_name(Some(qualifier), name)?
+                .ok_or_else(|| {
+                    field_not_found(Some(qualifier.to_string()), name, self)
+                })?;
+            Ok((self.field_qualifiers[idx].as_ref(), self.field(idx)))
+        } else {
+            self.qualified_field_with_unqualified_name(name)
+        }
+    }
+
     /// Find all fields having the given qualifier
-    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&DFField> {
-        self.fields
-            .iter()
-            .filter(|field| field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false))
+    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> {
+        self.iter()
+            .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false))
+            .map(|(_, f)| f.as_ref())
             .collect()
     }

@@ -320,31 +409,90 @@ impl DFSchema {
         &self,
         qualifier: &TableReference,
     ) -> Vec<usize> {
-        self.fields
-            .iter()
+        self.iter()
             .enumerate()
-            .filter_map(|(idx, field)| {
-                field
-                    .qualifier()
-                    .and_then(|q| q.eq(qualifier).then_some(idx))
-            })
+            .filter_map(|(idx, (q, _))| q.and_then(|q| q.eq(qualifier).then_some(idx)))
             .collect()
     }

-    /// Find all fields match the given name
-    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&DFField> {
-        self.fields
+    /// Find all fields that match the given name
+    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> {
+        self.fields()
             .iter()
             .filter(|field| field.name() == name)
+            .map(|f| f.as_ref())
             .collect()
     }

+    /// Find all fields that match the given name and return them with their qualifier
+    pub fn qualified_fields_with_unqualified_name(
+        &self,
+        name: &str,
+    ) -> Vec<(Option<&OwnedTableReference>, &Field)> {
+        self.iter()
+            .filter(|(_, field)| field.name() == name)
+            .map(|(qualifier, field)| (qualifier, field.as_ref()))
+            .collect()
+    }
+
+    /// Find all fields that match the given name and convert to column
+    pub fn columns_with_unqualified_name(&self, name: &str) -> Vec<Column> {
+        self.iter()
+            .filter(|(_, field)| field.name() == name)
+            .map(|(qualifier, field)| Column::new(qualifier.cloned(), field.name()))
+            .collect()
+    }
+
+    /// Return all `Column`s for the schema
+    pub fn columns(&self) -> Vec<Column> {
+        self.iter()
+            .map(|(qualifier, field)| {
+                Column::new(qualifier.cloned(), field.name().clone())
+            })
+            .collect()
+    }
+
+    /// Find the qualified field with the given unqualified name
+    pub fn qualified_field_with_unqualified_name(
+        &self,
+        name: &str,
+    ) -> Result<(Option<&OwnedTableReference>, &Field)> {
+        let matches = self.qualified_fields_with_unqualified_name(name);
+        match matches.len() {
+            0 => Err(unqualified_field_not_found(name, self)),
+            1 => Ok((matches[0].0, &matches[0].1)),
+            _ => {
+                // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem.
+                // The name may be generated by an Alias or similar, in which case it has no qualifier.
+                // For example:
+                // Join on id = b.id
+                // Project a.id as id   TableScan b id
+                // In this case, there isn't an `ambiguous name` problem. When `matches` contains
+                // exactly one field without a qualifier, we should return it.
+                let fields_without_qualifier = matches
+                    .iter()
+                    .filter(|(q, _)| q.is_none())
+                    .collect::<Vec<_>>();
+                if fields_without_qualifier.len() == 1 {
+                    Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1))
+                } else {
+                    _schema_err!(SchemaError::AmbiguousReference {
+                        field: Column {
+                            relation: None,
+                            name: name.to_string(),
+                        },
+                    })
+                }
+            }
+        }
+    }
+
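A minimal sketch of the new lookup API added above, which returns the qualifier alongside the Arrow field instead of a `DFField` (not part of the diff, assuming the `datafusion-common` crate at this revision):

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};

fn main() -> Result<()> {
    let schema = DFSchema::try_from_qualified_schema(
        "t1",
        &Schema::new(vec![Field::new("id", DataType::Int64, false)]),
    )?;
    // Unqualified lookup returns the qualifier alongside the Arrow field
    let (qualifier, field) = schema.qualified_field_with_unqualified_name("id")?;
    assert_eq!(qualifier.map(|q| q.to_string()), Some("t1".to_string()));
    assert_eq!(field.name(), "id");
    Ok(())
}
```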
     /// Find the field with the given name
-    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> {
-        let matches = self.fields_with_unqualified_name(name);
+    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> {
+        let matches = self.qualified_fields_with_unqualified_name(name);
         match matches.len() {
             0 => Err(unqualified_field_not_found(name, self)),
-            1 => Ok(matches[0]),
+            1 => Ok(matches[0].1),
             _ => {
                 // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem.
                 // The name may be generated by an Alias or similar, in which case it has no qualifier.
                 // For example:
                 // Join on id = b.id
                 // Project a.id as id   TableScan b id
                 // In this case, there isn't an `ambiguous name` problem. When `matches` contains
                 // exactly one field without a qualifier, we should return it.
                 let fields_without_qualifier = matches
                     .iter()
-                    .filter(|f| f.qualifier.is_none())
+                    .filter(|(q, _)| q.is_none())
                     .collect::<Vec<_>>();
                 if fields_without_qualifier.len() == 1 {
-                    Ok(fields_without_qualifier[0])
+                    Ok(fields_without_qualifier[0].1)
                 } else {
                     _schema_err!(SchemaError::AmbiguousReference {
                         field: Column {
@@ -376,7 +524,7 @@ impl DFSchema {
         &self,
         qualifier: &TableReference,
         name: &str,
-    ) -> Result<&DFField> {
+    ) -> Result<&Field> {
         let idx = self
             .index_of_column_by_name(Some(qualifier), name)?
             .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?;
@@ -385,13 +533,21 @@ impl DFSchema {
     }

     /// Find the field with the given qualified column
-    pub fn field_from_column(&self, column: &Column) -> Result<&DFField> {
+    pub fn field_from_column(&self, column: &Column) -> Result<&Field> {
         match &column.relation {
             Some(r) => self.field_with_qualified_name(r, &column.name),
             None => self.field_with_unqualified_name(&column.name),
         }
     }

+    /// Find the qualified field with the given qualified column
+    pub fn qualified_field_from_column(
+        &self,
+        column: &Column,
+    ) -> Result<(Option<&OwnedTableReference>, &Field)> {
+        self.qualified_field_with_name(column.relation.as_ref(), &column.name)
+    }
+
     /// Find if the field exists with the given name
     pub fn has_column_with_unqualified_name(&self, name: &str) -> bool {
         self.fields().iter().any(|field| field.name() == name)
@@ -403,10 +559,8 @@ impl DFSchema {
         qualifier: &TableReference,
         name: &str,
     ) -> bool {
-        self.fields().iter().any(|field| {
-            field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false)
-                && field.name() == name
-        })
+        self.iter()
+            .any(|(q, f)| q.map(|q| q.eq(qualifier)).unwrap_or(false) && f.name() == name)
     }

     /// Find if the field exists with the given qualified column
@@ -419,7 +573,8 @@ impl DFSchema {

     /// Check to see if unqualified field names matches field names in Arrow schema
     pub fn matches_arrow_schema(&self, arrow_schema: &Schema) -> bool {
-        self.fields
+        self.inner
+            .fields
             .iter()
             .zip(arrow_schema.fields().iter())
             .all(|(dffield, arrowfield)| dffield.name() == arrowfield.name())
@@ -457,10 +612,10 @@ impl DFSchema {
         if self.fields().len() != other.fields().len() {
             return false;
         }
-        let self_fields = self.fields().iter();
-        let other_fields = other.fields().iter();
-        self_fields.zip(other_fields).all(|(f1, f2)| {
-            f1.qualifier() == f2.qualifier()
+        let self_fields = self.iter();
+        let other_fields = other.iter();
+        self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| {
+            q1 == q2
                 && f1.name() == f2.name()
                 && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type())
         })
@@ -479,10 +634,10 @@ impl DFSchema {
         if self.fields().len() != other.fields().len() {
             return false;
         }
-        let self_fields = self.fields().iter();
-        let other_fields = other.fields().iter();
-        self_fields.zip(other_fields).all(|(f1, f2)| {
-            f1.qualifier() == f2.qualifier()
+        let self_fields = self.iter();
+        let other_fields = other.iter();
+        self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| {
+            q1 == q2
                 && f1.name() == f2.name()
                 && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
         })
@@ -588,12 +743,9 @@ impl DFSchema {
     /// Strip all field qualifier in schema
     pub fn strip_qualifiers(self) -> Self {
         DFSchema {
-            fields: self
-                .fields
-                .into_iter()
-                .map(|f| f.strip_qualifier())
-                .collect(),
-            ..self
+            field_qualifiers: vec![None; self.inner.fields.len()],
+            inner: self.inner,
+            functional_dependencies: self.functional_dependencies,
         }
     }

@@ -601,47 +753,53 @@ impl DFSchema {
     pub fn replace_qualifier(self, qualifier: impl Into<OwnedTableReference>) -> Self {
         let qualifier = qualifier.into();
         DFSchema {
-            fields: self
-                .fields
-                .into_iter()
-                .map(|f| DFField::from_qualified(qualifier.clone(), f.field))
-                .collect(),
-            ..self
+            field_qualifiers: vec![Some(qualifier); self.inner.fields.len()],
+            inner: self.inner,
+            functional_dependencies: self.functional_dependencies,
         }
     }

     /// Get list of fully-qualified field names in this schema
     pub fn field_names(&self) -> Vec<String> {
-        self.fields
-            .iter()
-            .map(|f| f.qualified_name())
+        self.iter()
+            .map(|(qualifier, field)| qualified_name(qualifier, field.name()))
             .collect::<Vec<_>>()
     }

     /// Get metadata of this schema
     pub fn metadata(&self) -> &HashMap<String, String> {
-        &self.metadata
+        &self.inner.metadata
     }

     /// Get functional dependencies
     pub fn functional_dependencies(&self) -> &FunctionalDependencies {
         &self.functional_dependencies
     }
+
+    /// Iterate over the qualifiers and fields in the DFSchema
+    pub fn iter(
+        &self,
+    ) -> impl Iterator<Item = (Option<&OwnedTableReference>, &FieldRef)> {
+        self.field_qualifiers
+            .iter()
+            .zip(self.inner.fields().iter())
+            .map(|(qualifier, field)| (qualifier.as_ref(), field))
+    }
 }

 impl From<DFSchema> for Schema {
     /// Convert DFSchema into a Schema
     fn from(df_schema: DFSchema) -> Self {
-        let fields: Fields = df_schema.fields.into_iter().map(|f| f.field).collect();
-        Schema::new_with_metadata(fields, df_schema.metadata)
+        let fields: Fields = df_schema.inner.fields.clone();
+        Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
     }
 }

 impl From<&DFSchema> for Schema {
     /// Convert DFSchema reference into a Schema
     fn from(df_schema: &DFSchema) -> Self {
-        let fields: Fields = df_schema.fields.iter().map(|f| f.field.clone()).collect();
-        Schema::new_with_metadata(fields, df_schema.metadata.clone())
+        let fields: Fields = df_schema.inner.fields.clone();
+        Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
     }
 }

@@ -649,14 +807,13 @@ impl From<&DFSchema> for Schema {
 impl TryFrom<Schema> for DFSchema {
     type Error = DataFusionError;
     fn try_from(schema: Schema) -> Result<Self, Self::Error> {
-        Self::new_with_metadata(
-            schema
-                .fields()
-                .iter()
-                .map(|f| DFField::from(f.clone()))
-                .collect(),
-            schema.metadata().clone(),
-        )
+        let field_count = schema.fields.len();
+        let dfschema = Self {
+            inner: schema.into(),
+            field_qualifiers: vec![None; field_count],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        Ok(dfschema)
     }
 }
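A minimal sketch of the new `iter` accessor and the (unchanged) conversion back to Arrow (not part of the diff, assuming the `datafusion-common` crate at this revision):

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};

fn main() -> Result<()> {
    let df_schema = DFSchema::try_from_qualified_schema(
        "t1",
        &Schema::new(vec![
            Field::new("a", DataType::Int32, false),
            Field::new("b", DataType::Utf8, true),
        ]),
    )?;
    // `iter` yields each optional qualifier together with its FieldRef
    for (qualifier, field) in df_schema.iter() {
        println!("{:?}.{}", qualifier, field.name());
    }
    // Converting back to an Arrow Schema drops the qualifiers
    let arrow_schema = Schema::from(df_schema);
    assert_eq!(arrow_schema.fields().len(), 2);
    Ok(())
}
```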
@@ -669,8 +826,8 @@ impl From<DFSchema> for SchemaRef {
 // Hashing refers to a subset of fields considered in PartialEq.
 impl Hash for DFSchema {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.fields.hash(state);
-        self.metadata.len().hash(state); // HashMap is not hashable
+        self.inner.fields.hash(state);
+        self.inner.metadata.len().hash(state); // HashMap is not hashable
     }
 }

@@ -705,9 +862,19 @@ impl ToDFSchema for SchemaRef {
     }
 }

-impl ToDFSchema for Vec<DFField> {
+impl ToDFSchema for Vec<Field> {
     fn to_dfschema(self) -> Result<DFSchema> {
-        DFSchema::new_with_metadata(self, HashMap::new())
+        let field_count = self.len();
+        let schema = Schema {
+            fields: self.into(),
+            metadata: HashMap::new(),
+        };
+        let dfschema = DFSchema {
+            inner: schema.into(),
+            field_qualifiers: vec![None; field_count],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        Ok(dfschema)
     }
 }

@@ -716,12 +883,11 @@ impl Display for DFSchema {
         write!(
             f,
             "fields:[{}], metadata:{:?}",
-            self.fields
-                .iter()
-                .map(|field| field.qualified_name())
+            self.iter()
+                .map(|(q, f)| qualified_name(q, f.name()))
                 .collect::<Vec<String>>()
                 .join(", "),
-            self.metadata
+            self.inner.metadata
         )
     }
 }

@@ -783,138 +949,6 @@ impl ExprSchema for DFSchema {
     }
 }

-/// DFField wraps an Arrow field and adds an optional qualifier
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct DFField {
-    /// Optional qualifier (usually a table or relation name)
-    qualifier: Option<OwnedTableReference>,
-    /// Arrow field definition
-    field: FieldRef,
-}
-
-impl DFField {
-    /// Creates a new `DFField`
-    pub fn new<R: Into<OwnedTableReference>>(
-        qualifier: Option<R>,
-        name: &str,
-        data_type: DataType,
-        nullable: bool,
-    ) -> Self {
-        DFField {
-            qualifier: qualifier.map(|s| s.into()),
-            field: Arc::new(Field::new(name, data_type, nullable)),
-        }
-    }
-
-    /// Convenience method for creating new `DFField` without a qualifier
-    pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self {
-        DFField {
-            qualifier: None,
-            field: Arc::new(Field::new(name, data_type, nullable)),
-        }
-    }
-
-    /// Create a qualified field from an existing Arrow field
-    pub fn from_qualified<'a>(
-        qualifier: impl Into<TableReference<'a>>,
-        field: impl Into<FieldRef>,
-    ) -> Self {
-        Self {
-            qualifier: Some(qualifier.into().to_owned_reference()),
-            field: field.into(),
-        }
-    }
-
-    /// Returns an immutable reference to the `DFField`'s unqualified name
-    pub fn name(&self) -> &String {
-        self.field.name()
-    }
-
-    /// Returns an immutable reference to the `DFField`'s data-type
-    pub fn data_type(&self) -> &DataType {
-        self.field.data_type()
-    }
-
-    /// Indicates whether this `DFField` supports null values
-    pub fn is_nullable(&self) -> bool {
-        self.field.is_nullable()
-    }
-
-    pub fn metadata(&self) -> &HashMap<String, String> {
-        self.field.metadata()
-    }
-
-    /// Returns a string to the `DFField`'s qualified name
-    pub fn qualified_name(&self) -> String {
-        if let Some(qualifier) = &self.qualifier {
-            format!("{}.{}", qualifier, self.field.name())
-        } else {
-            self.field.name().to_owned()
-        }
-    }
-
-    /// Builds a qualified column based on self
-    pub fn qualified_column(&self) -> Column {
-        Column {
-            relation: self.qualifier.clone(),
-            name: self.field.name().to_string(),
-        }
-    }
-
-    /// Builds an unqualified column based on self
-    pub fn unqualified_column(&self) -> Column {
-        Column {
-            relation: None,
-            name: self.field.name().to_string(),
-        }
-    }
-
-    /// Get the optional qualifier
-    pub fn qualifier(&self) -> Option<&OwnedTableReference> {
-        self.qualifier.as_ref()
-    }
-
-    /// Get the arrow field
-    pub fn field(&self) -> &FieldRef {
-        &self.field
-    }
-
-    /// Return field with qualifier stripped
-    pub fn strip_qualifier(mut self) -> Self {
-        self.qualifier = None;
-        self
-    }
-
-    /// Return field with nullable specified
-    pub fn with_nullable(mut self, nullable: bool) -> Self {
-        let f = self.field().as_ref().clone().with_nullable(nullable);
-        self.field = f.into();
-        self
-    }
-
-    /// Return field with new metadata
-    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
-        let f = self.field().as_ref().clone().with_metadata(metadata);
-        self.field = f.into();
-        self
-    }
-}
-
-impl From<FieldRef> for DFField {
-    fn from(value: FieldRef) -> Self {
-        Self {
-            qualifier: None,
-            field: value,
-        }
-    }
-}
-
-impl From<Field> for DFField {
-    fn from(value: Field) -> Self {
-        Self::from(Arc::new(value))
-    }
-}
-
 /// DataFusion-specific extensions to [`Schema`].
 pub trait SchemaExt {
     /// This is a specialized version of Eq that ignores differences
@@ -967,6 +1001,13 @@ impl SchemaExt for Schema {
     }
 }

+pub fn qualified_name(qualifier: Option<&TableReference>, name: &str) -> String {
+    match qualifier {
+        Some(q) => format!("{}.{}", q, name),
+        None => name.to_string(),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use crate::assert_contains;
@@ -1007,22 +1048,6 @@ mod tests {
         Ok(())
     }

-    #[test]
-    fn from_unqualified_field() {
-        let field = Field::new("c0", DataType::Boolean, true);
-        let field = DFField::from(field);
-        assert_eq!("c0", field.name());
-        assert_eq!("c0", field.qualified_name());
-    }
-
-    #[test]
-    fn from_qualified_field() {
-        let field = Field::new("c0", DataType::Boolean, true);
-        let field = DFField::from_qualified("t1", field);
-        assert_eq!("c0", field.name());
-        assert_eq!("t1.c0", field.qualified_name());
-    }
-
     #[test]
     fn from_unqualified_schema() -> Result<()> {
         let schema = DFSchema::try_from(test_schema_1())?;
@@ -1037,6 +1062,35 @@ mod tests {
         Ok(())
     }

+    #[test]
+    fn test_from_field_specific_qualified_schema() -> Result<()> {
+        let schema = DFSchema::from_field_specific_qualified_schema(
+            vec![Some("t1"), None],
+            &Arc::new(Schema::new(vec![
+                Field::new("c0", DataType::Boolean, true),
+                Field::new("c1", DataType::Boolean, true),
+            ])),
+        )?;
+        assert_eq!("fields:[t1.c0, c1], metadata:{}", schema.to_string());
+        Ok(())
+    }
+
+    #[test]
+    fn test_from_qualified_fields() -> Result<()> {
+        let schema = DFSchema::new_with_metadata(
+            vec![
+                (
+                    Some("t0".into()),
+                    Arc::new(Field::new("c0", DataType::Boolean, true)),
+                ),
+                (None, Arc::new(Field::new("c1", DataType::Boolean, true))),
+            ],
+            HashMap::new(),
+        )?;
+        assert_eq!("fields:[t0.c0, c1], metadata:{}", schema.to_string());
+        Ok(())
+    }
+
     #[test]
     fn from_qualified_schema_into_arrow_schema() -> Result<()> {
         let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?;
@@ -1168,252 +1222,6 @@ mod tests {
         assert_eq!(err.strip_backtrace(), "Schema error: No field named c0.");
     }

-    #[test]
-    fn equivalent_names_and_types() {
-        let arrow_field1 = Field::new("f1", DataType::Int16, true);
-        let arrow_field1_meta = arrow_field1.clone().with_metadata(test_metadata_n(2));
-
-        let field1_i16_t = DFField::from(arrow_field1);
-        let field1_i16_t_meta = DFField::from(arrow_field1_meta);
-        let field1_i16_t_qualified =
-            DFField::from_qualified("foo", field1_i16_t.field().clone());
-        let field1_i16_f = DFField::from(Field::new("f1", DataType::Int16, false));
-        let field1_i32_t = DFField::from(Field::new("f1", DataType::Int32, true));
-        let field2_i16_t = DFField::from(Field::new("f2", DataType::Int16, true));
-        let field3_i16_t = DFField::from(Field::new("f3", DataType::Int16, true));
-
-        let dict =
-            DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
-        let field_dict_t = DFField::from(Field::new("f_dict", dict.clone(), true));
-        let field_dict_f = DFField::from(Field::new("f_dict", dict, false));
-
-        let list_t = DFField::from(Field::new_list(
-            "f_list",
-            field1_i16_t.field().clone(),
-            true,
-        ));
-        let list_f = DFField::from(Field::new_list(
-            "f_list",
-            field1_i16_f.field().clone(),
-            false,
-        ));
-
-        let list_f_name = DFField::from(Field::new_list(
-            "f_list",
-            field2_i16_t.field().clone(),
-            false,
-        ));
-
-        let struct_t = DFField::from(Field::new_struct(
-            "f_struct",
-            vec![field1_i16_t.field().clone()],
-            true,
-        ));
-        let struct_f = DFField::from(Field::new_struct(
-            "f_struct",
-            vec![field1_i16_f.field().clone()],
-            false,
-        ));
-
-        let struct_f_meta = DFField::from(Field::new_struct(
-            "f_struct",
-            vec![field1_i16_t_meta.field().clone()],
-            false,
-        ));
-
-        let struct_f_type = DFField::from(Field::new_struct(
-            "f_struct",
-            vec![field1_i32_t.field().clone()],
-            false,
-        ));
-
-        // same
-        TestCase {
-            fields1: vec![&field1_i16_t],
-            fields2: vec![&field1_i16_t],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // same but metadata is different, should still be true
-        TestCase {
-            fields1: vec![&field1_i16_t_meta],
-            fields2: vec![&field1_i16_t],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // different name
-        TestCase {
-            fields1: vec![&field1_i16_t],
-            fields2: vec![&field2_i16_t],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // different type
-        TestCase {
-            fields1: vec![&field1_i16_t],
-            fields2: vec![&field1_i32_t],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // different nullability
-        TestCase {
-            fields1: vec![&field1_i16_t],
-            fields2: vec![&field1_i16_f],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // different qualifier
-        TestCase {
-            fields1: vec![&field1_i16_t],
-            fields2: vec![&field1_i16_t_qualified],
-            expected_dfschema: false,
-            expected_arrow: true,
-        }
-        .run();
-
-        // different name after first
-        TestCase {
-            fields1: vec![&field2_i16_t, &field1_i16_t],
-            fields2: vec![&field2_i16_t, &field3_i16_t],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // different number
-        TestCase {
-            fields1: vec![&field1_i16_t, &field2_i16_t],
-            fields2: vec![&field1_i16_t],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // dictionary
-        TestCase {
-            fields1: vec![&field_dict_t],
-            fields2: vec![&field_dict_t],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // dictionary (different nullable)
-        TestCase {
-            fields1: vec![&field_dict_t],
-            fields2: vec![&field_dict_f],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // dictionary (wrong type)
-        TestCase {
-            fields1: vec![&field_dict_t],
-            fields2: vec![&field1_i16_t],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // list (different embedded nullability)
-        TestCase {
-            fields1: vec![&list_t],
-            fields2: vec![&list_f],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // list (different sub field names)
-        TestCase {
-            fields1: vec![&list_t],
-            fields2: vec![&list_f_name],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        // struct
-        TestCase {
-            fields1: vec![&struct_t],
-            fields2: vec![&struct_f],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // struct (different embedded meta)
-        TestCase {
-            fields1: vec![&struct_t],
-            fields2: vec![&struct_f_meta],
-            expected_dfschema: true,
-            expected_arrow: true,
-        }
-        .run();
-
-        // struct (different field type)
-        TestCase {
-            fields1: vec![&struct_t],
-            fields2: vec![&struct_f_type],
-            expected_dfschema: false,
-            expected_arrow: false,
-        }
-        .run();
-
-        #[derive(Debug)]
-        struct TestCase<'a> {
-            fields1: Vec<&'a DFField>,
-            fields2: Vec<&'a DFField>,
-            expected_dfschema: bool,
-            expected_arrow: bool,
-        }
-
-        impl<'a> TestCase<'a> {
-            fn run(self) {
-                println!("Running {self:#?}");
-                let schema1 = to_df_schema(self.fields1);
-                let schema2 = to_df_schema(self.fields2);
-                assert_eq!(
-                    schema1.equivalent_names_and_types(&schema2),
-                    self.expected_dfschema,
-                    "Comparison did not match expected: {}\n\n\
-                     schema1:\n\n{:#?}\n\nschema2:\n\n{:#?}",
-                    self.expected_dfschema,
-                    schema1,
-                    schema2
-                );
-
-                let arrow_schema1 = Schema::from(schema1);
-                let arrow_schema2 = Schema::from(schema2);
-                assert_eq!(
-                    arrow_schema1.equivalent_names_and_types(&arrow_schema2),
-                    self.expected_arrow,
-                    "Comparison did not match expected: {}\n\n\
-                     arrow schema1:\n\n{:#?}\n\n arrow schema2:\n\n{:#?}",
-                    self.expected_arrow,
-                    arrow_schema1,
-                    arrow_schema2
-                );
-            }
-        }
-
-        fn to_df_schema(fields: Vec<&DFField>) -> DFSchema {
-            let fields = fields.into_iter().cloned().collect();
-            DFSchema::new_with_metadata(fields, HashMap::new()).unwrap()
-        }
-    }
-
     #[test]
     fn into() {
         // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef
@@ -1424,11 +1232,11 @@ mod tests {
         );
         let arrow_schema_ref = Arc::new(arrow_schema.clone());

-        let df_schema = DFSchema::new_with_metadata(
-            vec![DFField::new_unqualified("c0", DataType::Int64, true)],
-            metadata,
-        )
-        .unwrap();
+        let df_schema = DFSchema {
+            inner: arrow_schema_ref.clone(),
+            field_qualifiers: vec![None; arrow_schema_ref.fields.len()],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
         let df_schema_ref = Arc::new(df_schema.clone());

         {
@@ -1468,16 +1276,15 @@ mod tests {
         b_metadata.insert("key".to_string(), "value".to_string());
         let b_field = Field::new("b", DataType::Int64, false).with_metadata(b_metadata);

-        let a: DFField = DFField::from_qualified("table1", a_field);
-        let b: DFField = DFField::from_qualified("table1", b_field);
+        let schema = Arc::new(Schema::new(vec![a_field, b_field]));

-        let df_schema = Arc::new(
-            DFSchema::new_with_metadata([a, b].to_vec(), HashMap::new()).unwrap(),
-        );
-        let schema: Schema = df_schema.as_ref().clone().into();
-        let a_df = df_schema.fields.first().unwrap().field();
-        let a_arrow = schema.fields.first().unwrap();
-        assert_eq!(a_df.metadata(), a_arrow.metadata())
+        let df_schema = DFSchema {
+            inner: schema.clone(),
+            field_qualifiers: vec![None; schema.fields.len()],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+
+        assert_eq!(df_schema.inner.metadata(), schema.metadata())
     }

     #[test]
diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs
index cafab6d334b3..234b65392222 100644
--- a/datafusion/common/src/error.rs
+++ b/datafusion/common/src/error.rs
@@ -613,11 +613,7 @@ pub fn field_not_found<R: Into<OwnedTableReference>>(
 ) -> DataFusionError {
     schema_datafusion_err!(SchemaError::FieldNotFound {
         field: Box::new(Column::new(qualifier, name)),
-        valid_fields: schema
-            .fields()
-            .iter()
-            .map(|f| f.qualified_column())
-            .collect(),
+        valid_fields: schema.columns().to_vec(),
     })
 }

@@ -625,11 +621,7 @@ pub fn field_not_found<R: Into<OwnedTableReference>>(
 pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionError {
     schema_datafusion_err!(SchemaError::FieldNotFound {
         field: Box::new(Column::new_unqualified(name)),
-        valid_fields: schema
-            .fields()
-            .iter()
-            .map(|f| f.qualified_column())
-            .collect(),
+        valid_fields: schema.columns().to_vec(),
     })
 }

diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs
index 1cb1751d713e..2eab0ece6d8b 100644
--- a/datafusion/common/src/functional_dependencies.rs
+++ b/datafusion/common/src/functional_dependencies.rs
@@ -73,16 +73,14 @@ impl Constraints {
                     is_primary,
                     ..
                 } => {
+                    let field_names = df_schema.field_names();
                     // Get primary key and/or unique indices in the schema:
                     let indices = columns
                         .iter()
                         .map(|pk| {
-                            let idx = df_schema
-                                .fields()
+                            let idx = field_names
                                 .iter()
-                                .position(|item| {
-                                    item.qualified_name() == pk.value.clone()
-                                })
+                                .position(|item| *item == pk.value)
                                 .ok_or_else(|| {
                                     DataFusionError::Execution(
                                         "Primary key doesn't exist".to_string(),
@@ -452,7 +450,7 @@ pub fn aggregate_functional_dependencies(
     aggr_schema: &DFSchema,
 ) -> FunctionalDependencies {
     let mut aggregate_func_dependencies = vec![];
-    let aggr_input_fields = aggr_input_schema.fields();
+    let aggr_input_fields = aggr_input_schema.field_names();
     let aggr_fields = aggr_schema.fields();
     // Association covers the whole table:
     let target_indices = (0..aggr_schema.fields().len()).collect::<Vec<_>>();
@@ -470,14 +468,14 @@ pub fn aggregate_functional_dependencies(
             let mut new_source_field_names = vec![];
             let source_field_names = source_indices
                 .iter()
-                .map(|&idx| aggr_input_fields[idx].qualified_name())
+                .map(|&idx| &aggr_input_fields[idx])
                 .collect::<Vec<_>>();
             for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() {
                 // When one of the input determinant expressions matches with
                 // the GROUP BY expression, add the index of the GROUP BY
                 // expression as a new determinant key:
-                if source_field_names.contains(group_by_expr_name) {
+                if source_field_names.contains(&group_by_expr_name) {
                     new_source_indices.push(idx);
                     new_source_field_names.push(group_by_expr_name.clone());
                 }
@@ -538,11 +536,7 @@ pub fn get_target_functional_dependencies(
 ) -> Option<Vec<usize>> {
     let mut combined_target_indices = HashSet::new();
     let dependencies = schema.functional_dependencies();
-    let field_names = schema
-        .fields()
-        .iter()
-        .map(|item| item.qualified_name())
-        .collect::<Vec<_>>();
+    let field_names = schema.field_names();
     for FunctionalDependence {
         source_indices,
         target_indices,
@@ -551,7 +545,7 @@ pub fn get_target_functional_dependencies(
     {
         let source_key_names = source_indices
             .iter()
-            .map(|id_key_idx| field_names[*id_key_idx].clone())
+            .map(|id_key_idx| &field_names[*id_key_idx])
             .collect::<Vec<_>>();
         // If the GROUP BY expression contains a determinant key, we can use
         // the associated fields after aggregation even if they are not part
@@ -577,11 +571,7 @@ pub fn get_required_group_by_exprs_indices(
     group_by_expr_names: &[String],
 ) -> Option<Vec<usize>> {
     let dependencies = schema.functional_dependencies();
-    let field_names = schema
-        .fields()
-        .iter()
-        .map(|item| item.qualified_name())
-        .collect::<Vec<_>>();
+    let field_names = schema.field_names();
     let mut groupby_expr_indices = group_by_expr_names
         .iter()
         .map(|group_by_expr_name| {
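The functional-dependency code above now resolves constraint columns through `DFSchema::field_names` instead of per-field `qualified_name()` calls. A minimal sketch of that lookup pattern (not part of the diff, with a hypothetical table `t`):

```rust
use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::{DFSchema, Result};

fn main() -> Result<()> {
    let schema = DFSchema::try_from_qualified_schema(
        "t",
        &Schema::new(vec![
            Field::new("id", DataType::Int64, false),
            Field::new("name", DataType::Utf8, true),
        ]),
    )?;
    // `field_names` returns fully qualified names, so a primary-key column can
    // be located with a plain position() scan
    let field_names = schema.field_names();
    let idx = field_names.iter().position(|n| n == "t.id");
    assert_eq!(idx, Some(0));
    Ok(())
}
```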
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index da7d6579bfe6..4d2e8b7417fb 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -46,7 +46,9 @@ pub mod utils;
 /// Reexport arrow crate
 pub use arrow;
 pub use column::Column;
-pub use dfschema::{DFField, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema};
+pub use dfschema::{
+    qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema,
+};
 pub use error::{
     field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError,
     SharedResult,
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index eea5fc1127ce..1db4f8ede692 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -195,11 +195,15 @@ impl DataFrame {
     pub fn select_columns(self, columns: &[&str]) -> Result<DataFrame> {
         let fields = columns
             .iter()
-            .map(|name| self.plan.schema().field_with_unqualified_name(name))
+            .map(|name| {
+                self.plan
+                    .schema()
+                    .qualified_field_with_unqualified_name(name)
+            })
             .collect::<Result<Vec<_>>>()?;
         let expr: Vec<Expr> = fields
-            .iter()
-            .map(|f| Expr::Column(f.qualified_column()))
+            .into_iter()
+            .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
             .collect();
         self.select(expr)
     }
@@ -1240,14 +1244,13 @@ impl DataFrame {
         let mut col_exists = false;
         let mut fields: Vec<Expr> = plan
             .schema()
-            .fields()
             .iter()
-            .map(|f| {
-                if f.name() == name {
+            .map(|(qualifier, field)| {
+                if field.name() == name {
                     col_exists = true;
                     new_column.clone()
                 } else {
-                    col(f.qualified_column())
+                    col(Column::from((qualifier, field.as_ref())))
                 }
             })
             .collect();
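The `DataFrame` changes above are internal; the public API is unchanged. A minimal usage sketch (not part of the diff, assuming a tokio runtime and a hypothetical local `example.csv`):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `example.csv` is a stand-in for any file with columns `a` and `b`
    let df = ctx.read_csv("example.csv", CsvReadOptions::new()).await?;
    // select_columns resolves each name to a (qualifier, field) pair internally
    let df = df
        .select_columns(&["a", "b"])?
        // with_column replaces an existing column or appends a new one
        .with_column("b_doubled", col("b") * lit(2))?;
    df.show().await?;
    Ok(())
}
```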
@@ -1298,24 +1301,25 @@ impl DataFrame {
             Column::from_qualified_name_ignore_case(old_name)
         };

-        let field_to_rename = match self.plan.schema().field_from_column(&old_column) {
-            Ok(field) => field,
-            // no-op if field not found
-            Err(DataFusionError::SchemaError(SchemaError::FieldNotFound { .. }, _)) => {
-                return Ok(self)
-            }
-            Err(err) => return Err(err),
-        };
+        let (qualifier_rename, field_rename) =
+            match self.plan.schema().qualified_field_from_column(&old_column) {
+                Ok(qualifier_and_field) => qualifier_and_field,
+                // no-op if field not found
+                Err(DataFusionError::SchemaError(
+                    SchemaError::FieldNotFound { .. },
+                    _,
+                )) => return Ok(self),
+                Err(err) => return Err(err),
+            };
         let projection = self
             .plan
             .schema()
-            .fields()
             .iter()
-            .map(|f| {
-                if f == field_to_rename {
-                    col(f.qualified_column()).alias(new_name)
+            .map(|(qualifier, field)| {
+                if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename {
+                    col(Column::from((qualifier, field.as_ref()))).alias(new_name)
                 } else {
-                    col(f.qualified_column())
+                    col(Column::from((qualifier, field.as_ref())))
                 }
             })
             .collect::<Vec<_>>();
diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs
index c53e8df35de8..f97d465c442b 100644
--- a/datafusion/core/src/datasource/listing/helpers.rs
+++ b/datafusion/core/src/datasource/listing/helpers.rs
@@ -31,13 +31,15 @@ use arrow::{
     record_batch::RecordBatch,
 };
 use arrow_schema::Fields;
-use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
-use datafusion_common::{internal_err, Column, DFField, DFSchema, DataFusionError};
 use datafusion_expr::execution_props::ExecutionProps;
+use futures::stream::FuturesUnordered;
+use futures::{stream::BoxStream, StreamExt, TryStreamExt};
+use log::{debug, trace};
+
+use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion};
+use datafusion_common::{internal_err, Column, DFSchema, DataFusionError};
 use datafusion_expr::{Expr, ScalarFunctionDefinition, Volatility};
 use datafusion_physical_expr::create_physical_expr;
-use futures::stream::{BoxStream, FuturesUnordered, StreamExt, TryStreamExt};
-use log::{debug, trace};
 use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};

@@ -267,10 +269,10 @@ async fn prune_partitions(
         .collect();
     let schema = Arc::new(Schema::new(fields));

-    let df_schema = DFSchema::new_with_metadata(
+    let df_schema = DFSchema::from_unqualified_fields(
         partition_cols
             .iter()
-            .map(|(n, d)| DFField::new_unqualified(n, d.clone(), true))
+            .map(|(n, d)| Field::new(n, d.clone(), true))
             .collect(),
         Default::default(),
     )?;
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
index 377dad5cee6c..c4a888f5462d 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -89,9 +89,10 @@ pub struct ParquetExec {
     metadata_size_hint: Option<usize>,
     /// Optional user defined parquet file reader factory
     parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
+    /// Cached plan properties such as equivalence properties, ordering, partitioning, etc.
     cache: PlanProperties,
-    /// Parquet Options
-    parquet_options: TableParquetOptions,
+    /// Options for reading Parquet files
+    table_parquet_options: TableParquetOptions,
 }

 impl ParquetExec {
@@ -100,7 +101,7 @@ impl ParquetExec {
         base_config: FileScanConfig,
         predicate: Option<Arc<dyn PhysicalExpr>>,
         metadata_size_hint: Option<usize>,
-        parquet_options: TableParquetOptions,
+        table_parquet_options: TableParquetOptions,
     ) -> Self {
         debug!("Creating ParquetExec, files: {:?}, projection {:?}, predicate: {:?}, limit: {:?}",
         base_config.file_groups, base_config.projection, predicate, base_config.limit);
@@ -155,15 +156,20 @@ impl ParquetExec {
             metadata_size_hint,
             parquet_file_reader_factory: None,
             cache,
-            parquet_options,
+            table_parquet_options,
         }
     }

-    /// Ref to the base configs
+    /// [`FileScanConfig`] that controls this scan (such as which files to read)
     pub fn base_config(&self) -> &FileScanConfig {
         &self.base_config
     }

+    /// Options passed to the parquet reader for this scan
+    pub fn table_parquet_options(&self) -> &TableParquetOptions {
+        &self.table_parquet_options
+    }
+
     /// Optional predicate.
     pub fn predicate(&self) -> Option<&Arc<dyn PhysicalExpr>> {
         self.predicate.as_ref()
     }
@@ -197,13 +203,13 @@ impl ParquetExec {
     ///
     /// [`Expr`]: datafusion_expr::Expr
     pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self {
-        self.parquet_options.global.pushdown_filters = pushdown_filters;
+        self.table_parquet_options.global.pushdown_filters = pushdown_filters;
         self
     }

     /// Return the value described in [`Self::with_pushdown_filters`]
     fn pushdown_filters(&self) -> bool {
-        self.parquet_options.global.pushdown_filters
+        self.table_parquet_options.global.pushdown_filters
     }

     /// If true, the `RowFilter` made by `pushdown_filters` may try to
@@ -213,13 +219,13 @@ impl ParquetExec {
     ///
     /// [`Expr`]: datafusion_expr::Expr
     pub fn with_reorder_filters(mut self, reorder_filters: bool) -> Self {
-        self.parquet_options.global.reorder_filters = reorder_filters;
+        self.table_parquet_options.global.reorder_filters = reorder_filters;
         self
     }

     /// Return the value described in [`Self::with_reorder_filters`]
     fn reorder_filters(&self) -> bool {
-        self.parquet_options.global.reorder_filters
+        self.table_parquet_options.global.reorder_filters
     }

     /// If enabled, the reader will read the page index
     /// This is used to optimise filter pushdown
     /// via `RowSelector` and `RowFilter` by
     /// eliminating unnecessary IO and decoding
     pub fn with_enable_page_index(mut self, enable_page_index: bool) -> Self {
-        self.parquet_options.global.enable_page_index = enable_page_index;
+        self.table_parquet_options.global.enable_page_index = enable_page_index;
         self
     }

     /// Return the value described in [`Self::with_enable_page_index`]
     fn enable_page_index(&self) -> bool {
-        self.parquet_options.global.enable_page_index
+        self.table_parquet_options.global.enable_page_index
     }

     /// If enabled, the reader will read by the bloom filter
     pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> Self {
-        self.parquet_options.global.bloom_filter_enabled = enable_bloom_filter;
+        self.table_parquet_options.global.bloom_filter_enabled = enable_bloom_filter;
         self
     }

     /// Return the value described in [`Self::with_enable_bloom_filter`]
     fn enable_bloom_filter(&self) -> bool {
-        self.parquet_options.global.bloom_filter_enabled
+        self.table_parquet_options.global.bloom_filter_enabled
     }

     fn output_partitioning_helper(file_config: &FileScanConfig) -> Partitioning {
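A sketch of how a caller might exercise the renamed field through the builder methods and the new accessor shown above (not part of the diff; `exec` is assumed to be a `ParquetExec` built from a `FileScanConfig` elsewhere):

```rust
use datafusion::datasource::physical_plan::ParquetExec;

/// Hypothetical helper: tighten the scan options on an existing ParquetExec.
fn tune_parquet_scan(exec: ParquetExec) -> ParquetExec {
    let exec = exec
        .with_pushdown_filters(true)   // evaluate pushed-down filters while decoding
        .with_reorder_filters(true)    // allow the reader to re-order row filters
        .with_enable_page_index(true); // prune pages via the page index
    // The renamed `table_parquet_options` field is exposed by the new accessor
    assert!(exec.table_parquet_options().global.pushdown_filters);
    exec
}
```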
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
index a82c5d97a2b7..8df4925fc566 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
@@ -234,6 +234,10 @@ impl PruningStatistics for BloomFilterStatistics {
             ScalarValue::Int32(Some(v)) => sbbf.check(v),
             ScalarValue::Int16(Some(v)) => sbbf.check(v),
             ScalarValue::Int8(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt64(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt32(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt16(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt8(Some(v)) => sbbf.check(v),
             ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
                 Type::INT32 => {
                     //https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs
index 85fb8939886c..d1b7dad15225 100644
--- a/datafusion/core/src/datasource/view.rs
+++ b/datafusion/core/src/datasource/view.rs
@@ -21,6 +21,7 @@ use std::{any::Any, sync::Arc};

 use arrow::datatypes::SchemaRef;
 use async_trait::async_trait;
+use datafusion_common::Column;
 use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown};

 use crate::{
@@ -126,9 +127,9 @@ impl TableProvider for ViewTable {
             let fields: Vec<Expr> = projection
                 .iter()
                 .map(|i| {
-                    Expr::Column(
-                        self.logical_plan.schema().field(*i).qualified_column(),
-                    )
+                    Expr::Column(Column::from(
+                        self.logical_plan.schema().qualified_field(*i),
+                    ))
                 })
                 .collect();
             plan.project(fields)?
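The row_groups.rs change above extends bloom-filter pruning to unsigned integer literals. A minimal sketch of a query that can now consult the column's bloom filter before decoding row groups (not part of the diff; `data.parquet` and column `u` are hypothetical stand-ins):

```rust
use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `data.parquet` stands in for any file with a UInt32 column `u`
    ctx.register_parquet("t", "data.parquet", ParquetReadOptions::default())
        .await?;
    // With the row_groups.rs change, the UInt32 literal below can consult the
    // column's bloom filter, so non-matching row groups are pruned before decode
    let df = ctx.sql("SELECT * FROM t WHERE u = 42").await?;
    df.show().await?;
    Ok(())
}
```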
The following presentations offer high level overviews of the diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 0a1730e944d3..4733c1433ad0 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1022,10 +1022,9 @@ impl DefaultPhysicalPlanner { // Remove temporary projected columns let join_plan = if added_project { let final_join_result = join_schema - .fields() .iter() - .map(|field| { - Expr::Column(field.qualified_column()) + .map(|(qualifier, field)| { + Expr::Column(datafusion_common::Column::from((qualifier, field.as_ref()))) }) .collect::>(); let projection = @@ -1089,18 +1088,19 @@ impl DefaultPhysicalPlanner { let (filter_df_fields, filter_fields): (Vec<_>, Vec<_>) = left_field_indices.clone() .into_iter() .map(|i| ( - left_df_schema.field(i).clone(), + left_df_schema.qualified_field(i), physical_left.schema().field(i).clone(), )) .chain( right_field_indices.clone() .into_iter() .map(|i| ( - right_df_schema.field(i).clone(), + right_df_schema.qualified_field(i), physical_right.schema().field(i).clone(), )) ) .unzip(); + let filter_df_fields = filter_df_fields.into_iter().map(|(qualifier, field)| (qualifier.cloned(), Arc::new(field.clone()))).collect(); // Construct intermediate schemas used for filtering data and // convert logical expression to physical according to filter schema @@ -2012,9 +2012,7 @@ mod tests { use arrow::array::{ArrayRef, DictionaryArray, Int32Array}; use arrow::datatypes::{DataType, Field, Int32Type, SchemaRef}; use arrow::record_batch::RecordBatch; - use datafusion_common::{ - assert_contains, DFField, DFSchema, DFSchemaRef, TableReference, - }; + use datafusion_common::{assert_contains, DFSchema, DFSchemaRef, TableReference}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::{ @@ -2257,25 +2255,23 @@ mod tests { .await; let expected_error: &str = "Error during planning: \ - Extension planner for NoOp created an ExecutionPlan with mismatched schema. \ - LogicalPlan schema: DFSchema { fields: [\ - DFField { qualifier: None, field: Field { \ - name: \"a\", \ + Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
\ + LogicalPlan schema: \ + DFSchema { inner: Schema { fields: \ + [Field { name: \"a\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} } }\ - ], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - ExecutionPlan schema: Schema { fields: [\ - Field { \ - name: \"b\", \ + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }, field_qualifiers: [None], \ + functional_dependencies: FunctionalDependencies { deps: [] } }, \ + ExecutionPlan schema: Schema { fields: \ + [Field { name: \"b\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} }\ - ], metadata: {} }"; + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }"; match plan { Ok(_) => panic!("Expected planning failure"), Err(e) => assert!( @@ -2539,8 +2535,8 @@ mod tests { fn default() -> Self { Self { schema: DFSchemaRef::new( - DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int32, false)], + DFSchema::from_unqualifed_fields( + vec![Field::new("a", DataType::Int32, false)].into(), HashMap::new(), ) .unwrap(), diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 1da86a0363a5..b4415d638ada 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -66,6 +66,7 @@ enum Scenario { Int, Int32Range, UInt, + UInt32Range, Float64, Decimal, DecimalBloomFilterInt32, @@ -455,6 +456,13 @@ fn make_int32_range(start: i32, end: i32) -> RecordBatch { RecordBatch::try_new(schema, vec![array.clone()]).unwrap() } +fn make_uint32_range(start: u32, end: u32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("u", DataType::UInt32, true)])); + let v = vec![start, end]; + let array = Arc::new(UInt32Array::from(v)) as ArrayRef; + RecordBatch::try_new(schema, vec![array.clone()]).unwrap() +} + /// Return record batch with f64 vector /// /// Columns are named @@ -659,6 +667,9 @@ fn create_data_batch(scenario: Scenario) -> Vec { make_uint_batches(250, 255), ] } + Scenario::UInt32Range => { + vec![make_uint32_range(0, 10), make_uint32_range(200000, 300000)] + } Scenario::Float64 => { vec![ make_f64_batch(vec![-5.0, -4.0, -3.0, -2.0, -1.0]), diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index b70102f78a96..b7b434d1c3d3 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -339,7 +339,7 @@ macro_rules! int_tests { async fn []() { RowGroupPruningTest::new() .with_scenario(Scenario::Int) - .with_query(&format!("SELECT * FROM t where i{} = 1", $bits)) + .with_query(&format!("SELECT * FROM t where abs(i{}) = 1 and i{} = 1", $bits, $bits)) .with_expected_errors(Some(0)) .with_matched_by_stats(Some(1)) .with_pruned_by_stats(Some(3)) @@ -452,6 +452,144 @@ int_tests!(16, correct_bloom_filters: false); int_tests!(32, correct_bloom_filters: true); int_tests!(64, correct_bloom_filters: true); +// $bits: number of bits of the integer to test (8, 16, 32, 64) +// $correct_bloom_filters: if false, replicates the +// https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests pass +// if and only if Bloom filters on UInt8 and UInt16 columns are still buggy. +macro_rules! uint_tests { + ($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => { + paste::item! 
{
+ #[tokio::test]
+ async fn [<prune_uint $bits _lt>]() {
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{} < 6", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(3))
+ .with_pruned_by_stats(Some(1))
+ .with_matched_by_bloom_filter(Some(0))
+ .with_pruned_by_bloom_filter(Some(0))
+ .with_expected_rows(11)
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _eq>]() {
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{} = 6", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(1))
+ .with_pruned_by_stats(Some(3))
+ .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
+ .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
+ .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
+ .test_row_group_prune()
+ .await;
+ }
+ #[tokio::test]
+ async fn [<prune_uint $bits _scalar_fun_and_eq>]() {
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 36 and u{} = 6", $bits, $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(1))
+ .with_pruned_by_stats(Some(3))
+ .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
+ .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
+ .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _scalar_fun>]() {
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where power(u{}, 2) = 25", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(0))
+ .with_pruned_by_stats(Some(0))
+ .with_matched_by_bloom_filter(Some(0))
+ .with_pruned_by_bloom_filter(Some(0))
+ .with_expected_rows(2)
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _complex_expr>]() {
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{}+1 = 6", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(0))
+ .with_pruned_by_stats(Some(0))
+ .with_matched_by_bloom_filter(Some(0))
+ .with_pruned_by_bloom_filter(Some(0))
+ .with_expected_rows(2)
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _eq_in_list>]() {
+ // result of sql "SELECT * FROM t where u in (6)"
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{} in (6)", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(1))
+ .with_pruned_by_stats(Some(3))
+ .with_matched_by_bloom_filter(Some(if $correct_bloom_filters { 1 } else { 0 }))
+ .with_pruned_by_bloom_filter(Some(if $correct_bloom_filters { 0 } else { 1 }))
+ .with_expected_rows(if $correct_bloom_filters { 1 } else { 0 })
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _eq_in_list_2>]() {
+ // result of sql "SELECT * FROM t where u in (100)", prune all
+ // test whether statistics works
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{} in (100)", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(0))
+ .with_pruned_by_stats(Some(4))
+ .with_matched_by_bloom_filter(Some(0))
+ .with_pruned_by_bloom_filter(Some(0))
+ .with_expected_rows(0)
+ .test_row_group_prune()
+ .await;
+ }
+
+ #[tokio::test]
+ async fn [<prune_uint $bits _eq_in_list_negated>]() {
+ // result of sql "SELECT * FROM t where u not in (6)", prune nothing
+ 
RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt)
+ .with_query(&format!("SELECT * FROM t where u{} not in (6)", $bits))
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(4))
+ .with_pruned_by_stats(Some(0))
+ .with_matched_by_bloom_filter(Some(4))
+ .with_pruned_by_bloom_filter(Some(0))
+ .with_expected_rows(19)
+ .test_row_group_prune()
+ .await;
+ }
+ }
+ };
+}
+
+uint_tests!(8, correct_bloom_filters: false);
+uint_tests!(16, correct_bloom_filters: false);
+uint_tests!(32, correct_bloom_filters: true);
+uint_tests!(64, correct_bloom_filters: true);
+
 #[tokio::test]
 async fn prune_int32_eq_large_in_list() {
 // result of sql "SELECT * FROM t where i in (2050...2582)", prune all
@@ -474,6 +612,28 @@ async fn prune_int32_eq_large_in_list() {
 .await;
 }
 
+#[tokio::test]
+async fn prune_uint32_eq_large_in_list() {
+ // result of sql "SELECT * FROM t where u in (200050...200081)"; the
+ // statistics match one row group, which the bloom filter then prunes
+ RowGroupPruningTest::new()
+ .with_scenario(Scenario::UInt32Range)
+ .with_query(
+ format!(
+ "SELECT * FROM t where u in ({})",
+ (200050..200082).join(",")
+ )
+ .as_str(),
+ )
+ .with_expected_errors(Some(0))
+ .with_matched_by_stats(Some(1))
+ .with_pruned_by_stats(Some(0))
+ .with_matched_by_bloom_filter(Some(0))
+ .with_pruned_by_bloom_filter(Some(1))
+ .with_expected_rows(0)
+ .test_row_group_prune()
+ .await;
+}
+
 #[tokio::test]
 async fn prune_f64_lt() {
 RowGroupPruningTest::new()
diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs
index a1b3b717392e..744192e9a3e1 100644
--- a/datafusion/expr/src/built_in_function.rs
+++ b/datafusion/expr/src/built_in_function.rs
@@ -69,18 +69,8 @@ pub enum BuiltinScalarFunction {
 Pi,
 /// power
 Power,
- /// radians
- Radians,
 /// round
 Round,
- /// signum
- Signum,
- /// sin
- Sin,
- /// sinh
- Sinh,
- /// sqrt
- Sqrt,
 /// trunc
 Trunc,
 /// cot
@@ -165,10 +155,6 @@ impl BuiltinScalarFunction {
 BuiltinScalarFunction::Pi => Volatility::Immutable,
 BuiltinScalarFunction::Power => Volatility::Immutable,
 BuiltinScalarFunction::Round => Volatility::Immutable,
- BuiltinScalarFunction::Signum => Volatility::Immutable,
- BuiltinScalarFunction::Sin => Volatility::Immutable,
- BuiltinScalarFunction::Sinh => Volatility::Immutable,
- BuiltinScalarFunction::Sqrt => Volatility::Immutable,
 BuiltinScalarFunction::Cbrt => Volatility::Immutable,
 BuiltinScalarFunction::Cot => Volatility::Immutable,
 BuiltinScalarFunction::Trunc => Volatility::Immutable,
@@ -176,7 +162,6 @@ impl BuiltinScalarFunction {
 BuiltinScalarFunction::ConcatWithSeparator => Volatility::Immutable,
 BuiltinScalarFunction::EndsWith => Volatility::Immutable,
 BuiltinScalarFunction::InitCap => Volatility::Immutable,
- BuiltinScalarFunction::Radians => Volatility::Immutable,
 
 // Volatile builtin functions
 BuiltinScalarFunction::Random => Volatility::Volatile,
@@ -241,12 +226,7 @@ impl BuiltinScalarFunction {
 | BuiltinScalarFunction::Degrees
 | BuiltinScalarFunction::Exp
 | BuiltinScalarFunction::Floor
- | BuiltinScalarFunction::Radians
 | BuiltinScalarFunction::Round
- | BuiltinScalarFunction::Signum
- | BuiltinScalarFunction::Sin
- | BuiltinScalarFunction::Sinh
- | BuiltinScalarFunction::Sqrt
 | BuiltinScalarFunction::Cbrt
 | BuiltinScalarFunction::Trunc
 | BuiltinScalarFunction::Cot => match input_expr_types[0] {
@@ -335,11 +315,6 @@ impl BuiltinScalarFunction {
 | BuiltinScalarFunction::Degrees
 | BuiltinScalarFunction::Exp
 | BuiltinScalarFunction::Floor
- | BuiltinScalarFunction::Radians
- | BuiltinScalarFunction::Signum
- | BuiltinScalarFunction::Sin
- |
BuiltinScalarFunction::Sinh - | BuiltinScalarFunction::Sqrt | BuiltinScalarFunction::Cot => { // math expressions expect 1 argument of type f64 or f32 // priority is given to f64 because e.g. `sqrt(1i32)` is in IR (real numbers) and thus we @@ -366,11 +341,7 @@ impl BuiltinScalarFunction { | BuiltinScalarFunction::Exp | BuiltinScalarFunction::Factorial | BuiltinScalarFunction::Floor - | BuiltinScalarFunction::Radians | BuiltinScalarFunction::Round - | BuiltinScalarFunction::Signum - | BuiltinScalarFunction::Sinh - | BuiltinScalarFunction::Sqrt | BuiltinScalarFunction::Cbrt | BuiltinScalarFunction::Trunc | BuiltinScalarFunction::Pi @@ -402,13 +373,8 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::Nanvl => &["nanvl"], BuiltinScalarFunction::Pi => &["pi"], BuiltinScalarFunction::Power => &["power", "pow"], - BuiltinScalarFunction::Radians => &["radians"], BuiltinScalarFunction::Random => &["random"], BuiltinScalarFunction::Round => &["round"], - BuiltinScalarFunction::Signum => &["signum"], - BuiltinScalarFunction::Sin => &["sin"], - BuiltinScalarFunction::Sinh => &["sinh"], - BuiltinScalarFunction::Sqrt => &["sqrt"], BuiltinScalarFunction::Trunc => &["trunc"], // conditional functions diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index a2015787040f..1aa063b17539 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -534,12 +534,9 @@ macro_rules! nary_scalar_expr { // generate methods for creating the supported unary/binary expressions // math functions -scalar_expr!(Sqrt, sqrt, num, "square root of a number"); scalar_expr!(Cbrt, cbrt, num, "cube root of a number"); -scalar_expr!(Sin, sin, num, "sine"); scalar_expr!(Cos, cos, num, "cosine"); scalar_expr!(Cot, cot, num, "cotangent"); -scalar_expr!(Sinh, sinh, num, "hyperbolic sine"); scalar_expr!(Cosh, cosh, num, "hyperbolic cosine"); scalar_expr!(Factorial, factorial, num, "factorial"); scalar_expr!( @@ -555,14 +552,12 @@ scalar_expr!( "nearest integer greater than or equal to argument" ); scalar_expr!(Degrees, degrees, num, "converts radians to degrees"); -scalar_expr!(Radians, radians, num, "converts degrees to radians"); nary_scalar_expr!(Round, round, "round to nearest integer"); nary_scalar_expr!( Trunc, trunc, "truncate toward zero, with optional precision" ); -scalar_expr!(Signum, signum, num, "sign of the argument (-1, 0, +1) "); scalar_expr!(Exp, exp, num, "exponential"); scalar_expr!(Gcd, gcd, arg_1 arg_2, "greatest common divisor"); scalar_expr!(Lcm, lcm, arg_1 arg_2, "least common multiple"); @@ -885,8 +880,8 @@ impl WindowUDFImpl for SimpleWindowUDF { /// ``` /// use datafusion_expr::{col, lit, call_fn}; /// -/// // create the expression sin(x) < 0.2 -/// let expr = call_fn("sin", vec![col("x")]).unwrap().lt(lit(0.2)); +/// // create the expression trunc(x) < 0.2 +/// let expr = call_fn("trunc", vec![col("x")]).unwrap().lt(lit(0.2)); /// ``` pub fn call_fn(name: impl AsRef, args: Vec) -> Result { match name.as_ref().parse::() { @@ -967,23 +962,18 @@ mod test { #[test] fn scalar_function_definitions() { - test_unary_scalar_expr!(Sqrt, sqrt); test_unary_scalar_expr!(Cbrt, cbrt); - test_unary_scalar_expr!(Sin, sin); test_unary_scalar_expr!(Cos, cos); test_unary_scalar_expr!(Cot, cot); - test_unary_scalar_expr!(Sinh, sinh); test_unary_scalar_expr!(Cosh, cosh); test_unary_scalar_expr!(Factorial, factorial); test_unary_scalar_expr!(Floor, floor); test_unary_scalar_expr!(Ceil, ceil); test_unary_scalar_expr!(Degrees, degrees); - test_unary_scalar_expr!(Radians, radians); 
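 // NOTE: the helpers removed above (sqrt, sin, sinh, signum, radians) are being
 // migrated from `BuiltinScalarFunction` to `ScalarUDF`s. A minimal sketch of the
 // replacement call path, assuming the udf-based `expr_fn` wrappers exported by
 // the `datafusion-functions` crate (the exact module path is an assumption):
 //
 //     use datafusion_functions::expr_fn::radians; // hypothetical import path
 //     let expr = radians(col("x"));               // replaces the removed radians() helper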
test_nary_scalar_expr!(Round, round, input); test_nary_scalar_expr!(Round, round, input, decimal_places); test_nary_scalar_expr!(Trunc, trunc, num); test_nary_scalar_expr!(Trunc, trunc, num, precision); - test_unary_scalar_expr!(Signum, signum); test_unary_scalar_expr!(Exp, exp); test_scalar_expr!(Nanvl, nanvl, x, y); test_scalar_expr!(Iszero, iszero, input); diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 7a227a91c455..60942adb6346 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -212,9 +212,10 @@ pub fn coerce_plan_expr_for_schema( _ => { let exprs: Vec = plan .schema() - .fields() .iter() - .map(|field| Expr::Column(field.qualified_column())) + .map(|(qualifier, field)| { + Expr::Column(Column::from((qualifier, field.as_ref()))) + }) .collect(); let new_exprs = coerce_exprs_for_schema(exprs, plan.schema(), schema)?; @@ -283,10 +284,9 @@ mod test { use super::*; use crate::expr::Sort; use crate::{col, lit, Cast}; - - use arrow::datatypes::DataType; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter}; - use datafusion_common::{DFField, DFSchema, ScalarValue}; + use datafusion_common::{DFSchema, OwnedTableReference, ScalarValue}; #[derive(Default)] struct RecordingRewriter { @@ -347,20 +347,21 @@ mod test { let expr = col("a") + col("b") + col("c"); // Schemas with some matching and some non matching cols - let schema_a = make_schema_with_empty_metadata(vec![ - make_field("tableA", "a"), - make_field("tableA", "aa"), - ]); - let schema_c = make_schema_with_empty_metadata(vec![ - make_field("tableC", "cc"), - make_field("tableC", "c"), - ]); - let schema_b = make_schema_with_empty_metadata(vec![make_field("tableB", "b")]); + let schema_a = make_schema_with_empty_metadata( + vec![Some("tableA".into()), Some("tableA".into())], + vec!["a", "aa"], + ); + let schema_c = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["cc", "c"], + ); + let schema_b = + make_schema_with_empty_metadata(vec![Some("tableB".into())], vec!["b"]); // non matching - let schema_f = make_schema_with_empty_metadata(vec![ - make_field("tableC", "f"), - make_field("tableC", "ff"), - ]); + let schema_f = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["f", "ff"], + ); let schemas = vec![schema_c, schema_f, schema_b, schema_a]; let schemas = schemas.iter().collect::>(); @@ -378,7 +379,7 @@ mod test { // test normalizing columns when the name doesn't exist let expr = col("a") + col("b"); let schema_a = - make_schema_with_empty_metadata(vec![make_field("\"tableA\"", "a")]); + make_schema_with_empty_metadata(vec![Some("\"tableA\"".into())], vec!["a"]); let schemas = [schema_a]; let schemas = schemas.iter().collect::>(); @@ -399,12 +400,16 @@ mod test { assert_eq!(unnormalized_expr, col("a") + col("b")); } - fn make_schema_with_empty_metadata(fields: Vec) -> DFSchema { - DFSchema::new_with_metadata(fields, HashMap::new()).unwrap() - } - - fn make_field(relation: &str, column: &str) -> DFField { - DFField::new(Some(relation.to_string()), column, DataType::Int8, false) + fn make_schema_with_empty_metadata( + qualifiers: Vec>, + fields: Vec<&str>, + ) -> DFSchema { + let fields = fields + .iter() + .map(|f| Arc::new(Field::new(f.to_string(), DataType::Int8, false))) + .collect::>(); + let schema = Arc::new(Schema::new(fields)); + 
DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap() } #[test] diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index b1bc11a83f90..2fb522b979b0 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -90,7 +90,7 @@ fn rewrite_in_terms_of_projection( let col = Expr::Column( found .to_field(input.schema()) - .map(|f| f.qualified_column())?, + .map(|(qualifier, field)| Column::new(qualifier, field.name()))?, ); return Ok(Transformed::yes(col)); } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index f1ac22d584ee..de157f3cda75 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFField, - ExprSchema, Result, + internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, + OwnedTableReference, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -46,7 +46,10 @@ pub trait ExprSchemable { fn metadata(&self, schema: &dyn ExprSchema) -> Result>; /// convert to a field with respect to a schema - fn to_field(&self, input_schema: &dyn ExprSchema) -> Result; + fn to_field( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, Arc)>; /// cast to a type with respect to a schema fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result; @@ -70,21 +73,20 @@ impl ExprSchemable for Expr { /// ## and Float32 results in Float32 type /// /// ``` - /// # use arrow::datatypes::DataType; - /// # use datafusion_common::{DFField, DFSchema}; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{col, ExprSchemable}; /// # use std::collections::HashMap; /// /// fn main() { /// let expr = col("c1") + col("c2"); - /// let schema = DFSchema::new_with_metadata( + /// let schema = DFSchema::from_unqualifed_fields( /// vec![ - /// DFField::new_unqualified("c1", DataType::Int32, true), - /// DFField::new_unqualified("c2", DataType::Float32, true), - /// ], + /// Field::new("c1", DataType::Int32, true), + /// Field::new("c2", DataType::Float32, true), + /// ].into(), /// HashMap::new(), - /// ) - /// .unwrap(); + /// ).unwrap(); /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` @@ -437,26 +439,37 @@ impl ExprSchemable for Expr { /// /// So for example, a projected expression `col(c1) + col(c2)` is /// placed in an output field **named** col("c1 + c2") - fn to_field(&self, input_schema: &dyn ExprSchema) -> Result { + fn to_field( + &self, + input_schema: &dyn ExprSchema, + ) -> Result<(Option, Arc)> { match self { Expr::Column(c) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok( - DFField::new(c.relation.clone(), &c.name, data_type, nullable) - .with_metadata(self.metadata(input_schema)?), - ) + Ok(( + c.relation.clone(), + Field::new(&c.name, data_type, nullable) + .with_metadata(self.metadata(input_schema)?) + .into(), + )) } Expr::Alias(Alias { relation, name, .. 
}) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok(DFField::new(relation.clone(), name, data_type, nullable) - .with_metadata(self.metadata(input_schema)?)) + Ok(( + relation.clone(), + Field::new(name, data_type, nullable) + .with_metadata(self.metadata(input_schema)?) + .into(), + )) } _ => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; - Ok( - DFField::new_unqualified(&self.display_name()?, data_type, nullable) - .with_metadata(self.metadata(input_schema)?), - ) + Ok(( + None, + Field::new(self.display_name()?, data_type, nullable) + .with_metadata(self.metadata(input_schema)?) + .into(), + )) } } } @@ -535,7 +548,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { - let cast_expr = Expr::Column(plan.schema().field(0).qualified_column()) + let cast_expr = Expr::Column(Column::from(plan.schema().qualified_field(0))) .cast_to(cast_to_type, subquery.subquery.schema())?; LogicalPlan::Projection(Projection::try_new( vec![cast_expr], @@ -553,8 +566,8 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result {{ @@ -679,23 +692,22 @@ mod tests { .unwrap() ); - let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("foo", DataType::Int32, true) - .with_metadata(meta.clone())], + let schema = DFSchema::from_unqualifed_fields( + vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())] + .into(), HashMap::new(), ) .unwrap(); // verify to_field method populates metadata - assert_eq!(&meta, expr.to_field(&schema).unwrap().metadata()); + assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata()); } #[test] fn test_nested_schema_nullability() { - let fields = DFField::new( - Some(TableReference::Bare { - table: "table_name".into(), - }), + let mut builder = SchemaBuilder::new(); + builder.push(Field::new("foo", DataType::Int32, true)); + builder.push(Field::new( "parent", DataType::Struct(Fields::from(vec![Field::new( "child", @@ -703,12 +715,17 @@ mod tests { false, )])), true, - ); + )); + let schema = builder.finish(); - let schema = DFSchema::new_with_metadata(vec![fields], HashMap::new()).unwrap(); + let dfschema = DFSchema::from_field_specific_qualified_schema( + vec![Some("table_name"), None], + &Arc::new(schema), + ) + .unwrap(); let expr = col("parent").field("child"); - assert!(expr.nullable(&schema).unwrap()); + assert!(expr.nullable(&dfschema).unwrap()); } #[derive(Debug)] diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index f47249d76d5b..e1f760c6e3f7 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -47,12 +47,12 @@ use crate::{ TableProviderFilterPushDown, TableSource, WriteOp, }; -use arrow::datatypes::{DataType, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ get_target_functional_dependencies, not_impl_err, plan_datafusion_err, plan_err, - Column, DFField, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, + Column, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, ScalarValue, TableReference, ToDFSchema, UnnestOptions, }; @@ -214,18 +214,14 @@ impl LogicalPlanBuilder { .map(|(j, data_type)| { // naming is following convention https://www.postgresql.org/docs/current/queries-values.html let name = &format!("column{}", j + 
1); - DFField::new_unqualified( - name, - data_type.clone().unwrap_or(DataType::Utf8), - true, - ) + Field::new(name, data_type.clone().unwrap_or(DataType::Utf8), true) }) .collect::>(); for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let schema = - DFSchemaRef::new(DFSchema::new_with_metadata(fields, HashMap::new())?); + let dfschema = DFSchema::from_unqualifed_fields(fields.into(), HashMap::new())?; + let schema = DFSchemaRef::new(dfschema); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } @@ -368,10 +364,9 @@ impl LogicalPlanBuilder { /// Select the given column indices pub fn select(self, indices: impl IntoIterator) -> Result { - let fields = self.plan.schema().fields(); let exprs: Vec<_> = indices .into_iter() - .map(|x| Expr::Column(fields[x].qualified_column())) + .map(|x| Expr::Column(Column::from(self.plan.schema().qualified_field(x)))) .collect(); self.project(exprs) } @@ -557,11 +552,7 @@ impl LogicalPlanBuilder { } // remove pushed down sort columns - let new_expr = schema - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect(); + let new_expr = schema.columns().into_iter().map(Expr::Column).collect(); let is_distinct = false; let plan = Self::add_missing_columns(self.plan, &missing_cols, is_distinct)?; @@ -1137,7 +1128,7 @@ impl LogicalPlanBuilder { )?)) } } -pub fn change_redundant_column(fields: Vec) -> Vec { +pub fn change_redundant_column(fields: &Fields) -> Vec { let mut name_map = HashMap::new(); fields .into_iter() @@ -1146,14 +1137,9 @@ pub fn change_redundant_column(fields: Vec) -> Vec { *counter += 1; if *counter > 1 { let new_name = format!("{}:{}", field.name(), *counter - 1); - DFField::new( - field.qualifier().cloned(), - &new_name, - field.data_type().clone(), - field.is_nullable(), - ) + Field::new(new_name, field.data_type().clone(), field.is_nullable()) } else { - field + field.as_ref().clone() } }) .collect() @@ -1165,67 +1151,82 @@ pub fn build_join_schema( right: &DFSchema, join_type: &JoinType, ) -> Result { - fn nullify_fields(fields: &[DFField]) -> Vec { + fn nullify_fields<'a>( + fields: impl Iterator, &'a Arc)>, + ) -> Vec<(Option, Arc)> { fields - .iter() - .map(|f| f.clone().with_nullable(true)) + .map(|(q, f)| { + // TODO: find a good way to do that + let field = f.as_ref().clone().with_nullable(true); + (q.map(|r| r.to_owned_reference()), Arc::new(field)) + }) .collect() } - let right_fields = right.fields(); - let left_fields = left.fields(); + let right_fields = right.iter(); + let left_fields = left.iter(); - let fields: Vec = match join_type { + let qualified_fields: Vec<(Option, Arc)> = match join_type + { JoinType::Inner => { // left then right - left_fields - .iter() - .chain(right_fields.iter()) - .cloned() - .collect() + let left_fields = left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); + let right_fields = right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); + left_fields.into_iter().chain(right_fields).collect() } JoinType::Left => { // left then right, right set to nullable in case of not matched scenario + let left_fields = left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); left_fields - .iter() - .chain(&nullify_fields(right_fields)) - .cloned() + .into_iter() + .chain(nullify_fields(right_fields)) .collect() } JoinType::Right => { // left then right, left set to nullable in case of not matched scenario + let 
right_fields = right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); nullify_fields(left_fields) - .iter() - .chain(right_fields.iter()) - .cloned() + .into_iter() + .chain(right_fields) .collect() } JoinType::Full => { // left then right, all set to nullable in case of not matched scenario nullify_fields(left_fields) - .iter() - .chain(&nullify_fields(right_fields)) - .cloned() + .into_iter() + .chain(nullify_fields(right_fields)) .collect() } JoinType::LeftSemi | JoinType::LeftAnti => { // Only use the left side for the schema - left_fields.clone() + left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect() } JoinType::RightSemi | JoinType::RightAnti => { // Only use the right side for the schema - right_fields.clone() + right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect() } }; let func_dependencies = left.functional_dependencies().join( right.functional_dependencies(), join_type, - left_fields.len(), + left.fields().len(), ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let schema = DFSchema::new_with_metadata(fields, metadata)?; - schema.with_functional_dependencies(func_dependencies) + let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?; + dfschema.with_functional_dependencies(func_dependencies) } /// Add additional "synthetic" group by expressions based on functional @@ -1252,9 +1253,7 @@ fn add_group_by_exprs_from_dependencies( get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { - let field = schema.field(idx); - let expr = - Expr::Column(Column::new(field.qualifier().cloned(), field.name())); + let expr = Expr::Column(Column::from(schema.qualified_field(idx))); let expr_name = expr.display_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); @@ -1325,33 +1324,33 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>>()? - .to_dfschema()?; + .collect::>>()?; + let union_schema = + DFSchema::new_with_metadata(union_qualified_fields, HashMap::new())?; let inputs = vec![left_plan, right_plan] .into_iter() @@ -1551,18 +1550,18 @@ pub fn unnest_with_options( column: Column, options: UnnestOptions, ) -> Result { - let unnest_field = input.schema().field_from_column(&column)?; + let (unnest_qualifier, unnest_field) = + input.schema().qualified_field_from_column(&column)?; // Extract the type of the nested field in the list. let unnested_field = match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => DFField::new( - unnest_field.qualifier().cloned(), + | DataType::LargeList(field) => Arc::new(Field::new( unnest_field.name(), field.data_type().clone(), unnest_field.is_nullable(), - ), + )), _ => { // If the unnest field is not a list type return the input plan. return Ok(input); @@ -1572,13 +1571,12 @@ pub fn unnest_with_options( // Update the schema with the unnest column type changed to contain the nested type. 
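 // For example (a sketch of the type rewrite, not additional behavior): a column
 // `tags` of type List(Utf8) keeps its qualifier and name, while its data type in
 // the new schema becomes the list's element type:
 //
 //     let before = Field::new("tags", DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), true);
 //     let after = Field::new("tags", DataType::Utf8, true);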
let input_schema = input.schema(); let fields = input_schema - .fields() .iter() - .map(|f| { - if f == unnest_field { - unnested_field.clone() + .map(|(q, f)| { + if f.as_ref() == unnest_field && q == unnest_qualifier { + (unnest_qualifier.cloned(), unnested_field.clone()) } else { - f.clone() + (q.cloned(), f.clone()) } }) .collect::>(); @@ -1588,10 +1586,11 @@ pub fn unnest_with_options( // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); + let column = Column::from((unnest_qualifier, unnested_field.as_ref())); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - column: unnested_field.qualified_column(), + column, schema, options, })) @@ -2113,23 +2112,23 @@ mod tests { } #[test] fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_1 = DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_3 = DFField::new_unqualified("a", DataType::Int32, false); - let t1_field_2 = DFField::new_unqualified("b", DataType::Int32, false); - let t2_field_2 = DFField::new_unqualified("b", DataType::Int32, false); + let t1_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_3 = Field::new("a", DataType::Int32, false); + let t1_field_2 = Field::new("b", DataType::Int32, false); + let t2_field_2 = Field::new("b", DataType::Int32, false); let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3]; - let remove_redundant = change_redundant_column(field_vec); + let remove_redundant = change_redundant_column(&Fields::from(field_vec)); assert_eq!( remove_redundant, vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("a:1", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), - DFField::new_unqualified("b:1", DataType::Int32, false), - DFField::new_unqualified("a:2", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("a:1", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("b:1", DataType::Int32, false), + Field::new("a:2", DataType::Int32, false), ] ); Ok(()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0bf5b8dffaa2..7a28b085cbc0 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -49,7 +49,7 @@ use datafusion_common::tree_node::{ }; use datafusion_common::{ aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints, - DFField, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, + DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, FunctionalDependencies, OwnedTableReference, ParamValues, Result, UnnestOptions, }; @@ -487,12 +487,12 @@ impl LogicalPlan { LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, .. 
}) => { static_term.head_output_expr() } - LogicalPlan::Union(union) => Ok(Some(Expr::Column( - union.schema.fields()[0].qualified_column(), - ))), - LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( - table.projected_schema.fields()[0].qualified_column(), - ))), + LogicalPlan::Union(union) => Ok(Some(Expr::Column(Column::from( + union.schema.qualified_field(0), + )))), + LogicalPlan::TableScan(table) => Ok(Some(Expr::Column(Column::from( + table.projected_schema.qualified_field(0), + )))), LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; expr_opt @@ -867,24 +867,30 @@ impl LogicalPlan { }) => { // Update schema with unnested column type. let input = Arc::new(inputs.swap_remove(0)); - let nested_field = input.schema().field_from_column(column)?; - let unnested_field = schema.field_from_column(column)?; - let fields = input + let (nested_qualifier, nested_field) = + input.schema().qualified_field_from_column(column)?; + let (unnested_qualifier, unnested_field) = + schema.qualified_field_from_column(column)?; + let qualifiers_and_fields = input .schema() - .fields() .iter() - .map(|f| { - if f == nested_field { - unnested_field.clone() + .map(|(qualifier, field)| { + if qualifier.eq(&nested_qualifier) + && field.as_ref() == nested_field + { + ( + unnested_qualifier.cloned(), + Arc::new(unnested_field.clone()), + ) } else { - f.clone() + (qualifier.cloned(), field.clone()) } }) .collect::>(); let schema = Arc::new( DFSchema::new_with_metadata( - fields, + qualifiers_and_fields, input.schema().metadata().clone(), )? // We can use the existing functional dependencies as is: @@ -1803,12 +1809,7 @@ impl Projection { /// Create a new Projection using the specified output schema pub fn new_from_schema(input: Arc, schema: DFSchemaRef) -> Self { - let expr: Vec = schema - .fields() - .iter() - .map(|field| field.qualified_column()) - .map(Expr::Column) - .collect(); + let expr: Vec = schema.columns().into_iter().map(Expr::Column).collect(); Self { expr, input, @@ -1860,9 +1861,10 @@ impl SubqueryAlias { alias: impl Into, ) -> Result { let alias = alias.into(); - let fields = change_redundant_column(plan.schema().fields().clone()); + let fields = change_redundant_column(plan.schema().fields()); let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = DFSchema::new_with_metadata(fields, meta_data)?.into(); + let schema: Schema = + DFSchema::from_unqualifed_fields(fields.into(), meta_data)?.into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); @@ -2007,9 +2009,13 @@ pub struct Window { impl Window { /// Create a new window operator. 
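 ///
 /// The resulting schema is the input schema with one field appended per window
 /// expression; the appended fields are produced by `exprlist_to_fields` below.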
pub fn try_new(window_expr: Vec, input: Arc) -> Result { - let fields = input.schema().fields(); + let fields: Vec<(Option, Arc)> = input + .schema() + .iter() + .map(|(q, f)| (q.cloned(), f.clone())) + .collect(); let input_len = fields.len(); - let mut window_fields = fields.clone(); + let mut window_fields = fields; let expr_fields = exprlist_to_fields(window_expr.as_slice(), &input)?; window_fields.extend_from_slice(expr_fields.as_slice()); let metadata = input.schema().metadata().clone(); @@ -2134,16 +2140,14 @@ impl TableScan { .map(|p| { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); + let df_schema = DFSchema::new_with_metadata( p.iter() .map(|i| { - DFField::from_qualified( - table_name.clone(), - schema.field(*i).clone(), - ) + (Some(table_name.clone()), Arc::new(schema.field(*i).clone())) }) .collect(), - schema.metadata().clone(), + schema.metadata.clone(), )?; df_schema.with_functional_dependencies(projected_func_dependencies) }) @@ -2335,9 +2339,12 @@ impl DistinctOn { } let on_expr = normalize_cols(on_expr, input.as_ref())?; + let qualified_fields = exprlist_to_fields(select_expr.as_slice(), &input)? + .into_iter() + .collect(); - let schema = DFSchema::new_with_metadata( - exprlist_to_fields(select_expr.as_slice(), &input)?, + let dfschema = DFSchema::new_with_metadata( + qualified_fields, input.schema().metadata().clone(), )?; @@ -2346,7 +2353,7 @@ impl DistinctOn { select_expr, sort_expr: None, input, - schema: Arc::new(schema), + schema: Arc::new(dfschema), }; if let Some(sort_expr) = sort_expr { @@ -2416,20 +2423,19 @@ impl Aggregate { let grouping_expr: Vec = grouping_set_to_exprlist(group_expr.as_slice())?; - let mut fields = exprlist_to_fields(grouping_expr.as_slice(), &input)?; + let mut qualified_fields = exprlist_to_fields(grouping_expr.as_slice(), &input)?; // Even columns that cannot be null will become nullable when used in a grouping set. if is_grouping_set { - fields = fields + qualified_fields = qualified_fields .into_iter() - .map(|field| field.with_nullable(true)) + .map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into())) .collect::>(); } - fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); + qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); - let schema = - DFSchema::new_with_metadata(fields, input.schema().metadata().clone())?; + let schema = DFSchema::new_with_metadata(qualified_fields, HashMap::new())?; Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(schema)) } @@ -2524,7 +2530,7 @@ fn calc_func_dependencies_for_project( exprs: &[Expr], input: &LogicalPlan, ) -> Result { - let input_fields = input.schema().fields(); + let input_fields = input.schema().field_names(); // Calculate expression indices (if present) in the input schema. 
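 // For example (a sketch): with input field names ["t.a", "t.b"], the expression
 // `col("t.b")` maps to index 1, since a qualified column displays the same way
 // as the corresponding qualified field name:
 //
 //     assert_eq!(Column::new(Some("t"), "b").flat_name(), "t.b");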
let proj_indices = exprs .iter() @@ -2535,9 +2541,7 @@ fn calc_func_dependencies_for_project( } _ => format!("{}", expr), }; - input_fields - .iter() - .position(|item| item.qualified_name() == expr_name) + input_fields.iter().position(|item| *item == expr_name) }) .collect::>(); Ok(input @@ -2673,7 +2677,6 @@ pub struct Unnest { #[cfg(test)] mod tests { - use std::collections::HashMap; use std::sync::Arc; use super::*; @@ -3084,7 +3087,7 @@ digraph { #[test] fn projection_expr_schema_mismatch() -> Result<()> { - let empty_schema = Arc::new(DFSchema::new_with_metadata(vec![], HashMap::new())?); + let empty_schema = Arc::new(DFSchema::empty()); let p = Projection::try_new_with_schema( vec![col("a")], Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { @@ -3258,10 +3261,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); @@ -3288,10 +3291,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index c7907d0db16a..0edcfa4bb522 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -30,12 +30,12 @@ use crate::{ Operator, TryCast, }; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::utils::get_at_indices; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, DFSchemaRef, - Result, ScalarValue, TableReference, + internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, + OwnedTableReference, Result, ScalarValue, TableReference, }; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; @@ -342,12 +342,9 @@ fn get_excluded_columns( let mut result = vec![]; for ident in unique_idents.into_iter() { let col_name = ident.value.as_str(); - let field = if let Some(qualifier) = qualifier { - schema.field_with_qualified_name(qualifier, col_name)? - } else { - schema.field_with_unqualified_name(col_name)? 
- }; - result.push(field.qualified_column()) + let (qualifier, field) = + schema.qualified_field_with_name(qualifier.as_ref(), col_name)?; + result.push(Column::from((qualifier, field))); } Ok(result) } @@ -359,18 +356,18 @@ fn get_exprs_except_skipped( ) -> Vec { if columns_to_skip.is_empty() { schema - .fields() .iter() - .map(|f| Expr::Column(f.qualified_column())) + .map(|(qualifier, field)| { + Expr::Column(Column::from((qualifier, field.as_ref()))) + }) .collect::>() } else { schema - .fields() + .columns() .iter() - .filter_map(|f| { - let col = f.qualified_column(); - if !columns_to_skip.contains(&col) { - Some(Expr::Column(col)) + .filter_map(|c| { + if !columns_to_skip.contains(c) { + Some(Expr::Column(c.clone())) } else { None } @@ -433,13 +430,14 @@ pub fn expand_qualified_wildcard( let projected_func_dependencies = schema .functional_dependencies() .project_functional_dependencies(&qualified_indices, qualified_indices.len()); - let qualified_fields = get_at_indices(schema.fields(), &qualified_indices)?; - if qualified_fields.is_empty() { + let fields_with_qualified = get_at_indices(schema.fields(), &qualified_indices)?; + if fields_with_qualified.is_empty() { return plan_err!("Invalid qualifier {qualifier}"); } - let qualified_schema = - DFSchema::new_with_metadata(qualified_fields, schema.metadata().clone())? - // We can use the functional dependencies as is, since it only stores indices: + + let qualified_schema = Arc::new(Schema::new(fields_with_qualified)); + let qualified_dfschema = + DFSchema::try_from_qualified_schema(qualifier.clone(), &qualified_schema)? .with_functional_dependencies(projected_func_dependencies)?; let excluded_columns = if let Some(WildcardAdditionalOptions { opt_exclude, @@ -459,7 +457,10 @@ pub fn expand_qualified_wildcard( // Add each excluded `Column` to columns_to_skip let mut columns_to_skip = HashSet::new(); columns_to_skip.extend(excluded_columns); - Ok(get_exprs_except_skipped(&qualified_schema, columns_to_skip)) + Ok(get_exprs_except_skipped( + &qualified_dfschema, + columns_to_skip, + )) } /// (expr, "is the SortExpr for window (either comes from PARTITION BY or ORDER BY columns)") @@ -737,7 +738,10 @@ fn agg_cols(agg: &Aggregate) -> Vec { .collect() } -fn exprlist_to_fields_aggregate(exprs: &[Expr], agg: &Aggregate) -> Result> { +fn exprlist_to_fields_aggregate( + exprs: &[Expr], + agg: &Aggregate, +) -> Result, Arc)>> { let agg_cols = agg_cols(agg); let mut fields = vec![]; for expr in exprs { @@ -753,7 +757,10 @@ fn exprlist_to_fields_aggregate(exprs: &[Expr], agg: &Aggregate) -> Result Result> { +pub fn exprlist_to_fields( + exprs: &[Expr], + plan: &LogicalPlan, +) -> Result, Arc)>> { // when dealing with aggregate plans we cannot simply look in the aggregate output schema // because it will contain columns representing complex expressions (such a column named // `GROUPING(person.state)` so in order to resolve `person.state` in this case we need to @@ -805,11 +812,15 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { )), Expr::ScalarSubquery(_) => e.clone(), _ => match e.display_name() { - Ok(name) => match input_schema.field_with_unqualified_name(&name) { - Ok(field) => Expr::Column(field.qualified_column()), - // expression not provided as input, do not convert to a column reference - Err(_) => e, - }, + Ok(name) => { + match input_schema.qualified_field_with_unqualified_name(&name) { + Ok((qualifier, field)) => { + Expr::Column(Column::from((qualifier, field))) + } + // expression not provided as input, do not 
convert to a column reference + Err(_) => e, + } + } Err(_) => e, }, } @@ -842,8 +853,8 @@ pub(crate) fn find_columns_referenced_by_expr(e: &Expr) -> Vec { pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(col) => { - let field = plan.schema().field_from_column(col)?; - Ok(Expr::Column(field.qualified_column())) + let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; + Ok(Expr::Column(Column::from((qualifier, field)))) } _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), } diff --git a/datafusion/functions-array/src/array_has.rs b/datafusion/functions-array/src/array_has.rs index 4e4ebaf035fc..ee064335c1cc 100644 --- a/datafusion/functions-array/src/array_has.rs +++ b/datafusion/functions-array/src/array_has.rs @@ -54,11 +54,17 @@ make_udf_function!(ArrayHasAny, ); #[derive(Debug)] -pub(super) struct ArrayHas { +pub struct ArrayHas { signature: Signature, aliases: Vec, } +impl Default for ArrayHas { + fn default() -> Self { + Self::new() + } +} + impl ArrayHas { pub fn new() -> Self { Self { @@ -121,11 +127,17 @@ impl ScalarUDFImpl for ArrayHas { } #[derive(Debug)] -pub(super) struct ArrayHasAll { +pub struct ArrayHasAll { signature: Signature, aliases: Vec, } +impl Default for ArrayHasAll { + fn default() -> Self { + Self::new() + } +} + impl ArrayHasAll { pub fn new() -> Self { Self { @@ -178,11 +190,17 @@ impl ScalarUDFImpl for ArrayHasAll { } #[derive(Debug)] -pub(super) struct ArrayHasAny { +pub struct ArrayHasAny { signature: Signature, aliases: Vec, } +impl Default for ArrayHasAny { + fn default() -> Self { + Self::new() + } +} + impl ArrayHasAny { pub fn new() -> Self { Self { diff --git a/datafusion/functions-array/src/concat.rs b/datafusion/functions-array/src/concat.rs index cb76192e29c2..f9d9bf4356ff 100644 --- a/datafusion/functions-array/src/concat.rs +++ b/datafusion/functions-array/src/concat.rs @@ -45,11 +45,17 @@ make_udf_function!( ); #[derive(Debug)] -pub(super) struct ArrayAppend { +pub struct ArrayAppend { signature: Signature, aliases: Vec, } +impl Default for ArrayAppend { + fn default() -> Self { + Self::new() + } +} + impl ArrayAppend { pub fn new() -> Self { Self { @@ -99,11 +105,17 @@ make_udf_function!( ); #[derive(Debug)] -pub(super) struct ArrayPrepend { +pub struct ArrayPrepend { signature: Signature, aliases: Vec, } +impl Default for ArrayPrepend { + fn default() -> Self { + Self::new() + } +} + impl ArrayPrepend { pub fn new() -> Self { Self { @@ -152,11 +164,17 @@ make_udf_function!( ); #[derive(Debug)] -pub(super) struct ArrayConcat { +pub struct ArrayConcat { signature: Signature, aliases: Vec, } +impl Default for ArrayConcat { + fn default() -> Self { + Self::new() + } +} + impl ArrayConcat { pub fn new() -> Self { Self { diff --git a/datafusion/functions-array/src/lib.rs b/datafusion/functions-array/src/lib.rs index 7c261f958bf0..5914736773b7 100644 --- a/datafusion/functions-array/src/lib.rs +++ b/datafusion/functions-array/src/lib.rs @@ -28,28 +28,28 @@ #[macro_use] pub mod macros; -mod array_has; -mod cardinality; -mod concat; -mod dimension; -mod empty; -mod except; -mod extract; -mod flatten; -mod length; -mod make_array; -mod position; -mod range; -mod remove; -mod repeat; -mod replace; -mod resize; -mod reverse; -mod rewrite; -mod set_ops; -mod sort; -mod string; -mod utils; +pub mod array_has; +pub mod cardinality; +pub mod concat; +pub mod dimension; +pub mod empty; +pub mod except; +pub mod extract; +pub mod flatten; +pub mod length; +pub mod make_array; +pub 
mod position; +pub mod range; +pub mod remove; +pub mod repeat; +pub mod replace; +pub mod resize; +pub mod reverse; +pub mod rewrite; +pub mod set_ops; +pub mod sort; +pub mod string; +pub mod utils; use datafusion_common::Result; use datafusion_execution::FunctionRegistry; diff --git a/datafusion/functions-array/src/make_array.rs b/datafusion/functions-array/src/make_array.rs index 8eaae09f28f5..0439a736ee42 100644 --- a/datafusion/functions-array/src/make_array.rs +++ b/datafusion/functions-array/src/make_array.rs @@ -48,6 +48,12 @@ pub struct MakeArray { aliases: Vec, } +impl Default for MakeArray { + fn default() -> Self { + Self::new() + } +} + impl MakeArray { pub fn new() -> Self { Self { diff --git a/datafusion/functions-array/src/udf.rs b/datafusion/functions-array/src/udf.rs new file mode 100644 index 000000000000..1462b3efad33 --- /dev/null +++ b/datafusion/functions-array/src/udf.rs @@ -0,0 +1,828 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ScalarUDFImpl`] definitions for array functions. 
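+//!
+//! A minimal usage sketch (assuming the `array_to_string_udf()` accessor that the
+//! `make_udf_function!` invocation below generates):
+//!
+//! ```ignore
+//! // build the expression array_to_string(a, ',')
+//! let expr = array_to_string_udf().call(vec![col("a"), lit(",")]);
+//! ```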
+ +use arrow::array::{NullArray, StringArray}; +use arrow::datatypes::DataType; +use arrow::datatypes::Field; +use arrow::datatypes::IntervalUnit::MonthDayNano; +use arrow_schema::DataType::{LargeUtf8, List, Utf8}; +use datafusion_common::exec_err; +use datafusion_common::plan_err; +use datafusion_common::Result; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::Expr; +use datafusion_expr::TypeSignature; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use std::any::Any; +use std::sync::Arc; + +// Create static instances of ScalarUDFs for each function +make_udf_function!(ArrayToString, + array_to_string, + array delimiter, // arg name + "converts each element to its text representation.", // doc + array_to_string_udf // internal function name +); +#[derive(Debug)] +pub struct ArrayToString { + signature: Signature, + aliases: Vec, +} + +impl ArrayToString { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![ + String::from("array_to_string"), + String::from("list_to_string"), + String::from("array_join"), + String::from("list_join"), + ], + } + } +} + +impl ScalarUDFImpl for ArrayToString { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_to_string" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match arg_types[0] { + List(_) | LargeList(_) | FixedSizeList(_, _) => Utf8, + _ => { + return plan_err!("The array_to_string function can only accept List/LargeList/FixedSizeList."); + } + }) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let args = ColumnarValue::values_to_arrays(args)?; + crate::kernels::array_to_string(&args).map(ColumnarValue::Array) + } + + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +make_udf_function!(StringToArray, + string_to_array, + string delimiter null_string, // arg name + "splits a `string` based on a `delimiter` and returns an array of parts. Any parts matching the optional `null_string` will be replaced with `NULL`", // doc + string_to_array_udf // internal function name +); +#[derive(Debug)] +pub struct StringToArray { + signature: Signature, + aliases: Vec, +} + +impl StringToArray { + pub fn new() -> Self { + Self { + signature: Signature::variadic_any(Volatility::Immutable), + aliases: vec![ + String::from("string_to_array"), + String::from("string_to_list"), + ], + } + } +} + +impl ScalarUDFImpl for StringToArray { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "string_to_array" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + use DataType::*; + Ok(match arg_types[0] { + Utf8 | LargeUtf8 => { + List(Arc::new(Field::new("item", arg_types[0].clone(), true))) + } + _ => { + return plan_err!( + "The string_to_array function can only accept Utf8 or LargeUtf8." + ); + } + }) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let mut args = ColumnarValue::values_to_arrays(args)?; + // Case: delimiter is NULL, needs to be handled as well. 
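+ // e.g. `string_to_array('a,b,c', NULL)` reaches this point with args[1] as a
+ // NullArray; the kernel downcasts the delimiter to a StringArray, so the
+ // delimiter is first replaced with an all-null StringArray of the same length.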
+        if args[1].as_any().is::<NullArray>() {
+            args[1] = Arc::new(StringArray::new_null(args[1].len()));
+        };
+
+        match args[0].data_type() {
+            Utf8 => {
+                crate::kernels::string_to_array::<i32>(&args).map(ColumnarValue::Array)
+            }
+            LargeUtf8 => {
+                crate::kernels::string_to_array::<i64>(&args).map(ColumnarValue::Array)
+            }
+            other => {
+                exec_err!("unsupported type for string_to_array function as {other}")
+            }
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    Range,
+    range,
+    start stop step,
+    "create a list of values in the range between start and stop",
+    range_udf
+);
+#[derive(Debug)]
+pub struct Range {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl Range {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![Int64]),
+                    TypeSignature::Exact(vec![Int64, Int64]),
+                    TypeSignature::Exact(vec![Int64, Int64, Int64]),
+                    TypeSignature::Exact(vec![Date32, Date32, Interval(MonthDayNano)]),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("range")],
+        }
+    }
+}
+impl ScalarUDFImpl for Range {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "range"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(List(Arc::new(Field::new(
+            "item",
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        match args[0].data_type() {
+            arrow::datatypes::DataType::Int64 => {
+                crate::kernels::gen_range(&args, false).map(ColumnarValue::Array)
+            }
+            arrow::datatypes::DataType::Date32 => {
+                crate::kernels::gen_range_date(&args, false).map(ColumnarValue::Array)
+            }
+            _ => {
+                exec_err!("unsupported type for range")
+            }
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    GenSeries,
+    gen_series,
+    start stop step,
+    "create a list of values in the range between start and stop, include upper bound",
+    gen_series_udf
+);
+#[derive(Debug)]
+pub struct GenSeries {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl GenSeries {
+    pub fn new() -> Self {
+        use DataType::*;
+        Self {
+            signature: Signature::one_of(
+                vec![
+                    TypeSignature::Exact(vec![Int64]),
+                    TypeSignature::Exact(vec![Int64, Int64]),
+                    TypeSignature::Exact(vec![Int64, Int64, Int64]),
+                    TypeSignature::Exact(vec![Date32, Date32, Interval(MonthDayNano)]),
+                ],
+                Volatility::Immutable,
+            ),
+            aliases: vec![String::from("generate_series")],
+        }
+    }
+}
+impl ScalarUDFImpl for GenSeries {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "generate_series"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(List(Arc::new(Field::new(
+            "item",
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        match args[0].data_type() {
+            arrow::datatypes::DataType::Int64 => {
+                crate::kernels::gen_range(&args, true).map(ColumnarValue::Array)
+            }
+            arrow::datatypes::DataType::Date32 => {
+                crate::kernels::gen_range_date(&args, true).map(ColumnarValue::Array)
+            }
+            _ => {
+                exec_err!("unsupported type for range")
+            }
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayDims,
+    array_dims,
+    array,
+    "returns an array of the array's dimensions.",
+    array_dims_udf
+);
+
+#[derive(Debug)]
+pub struct ArrayDims {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl ArrayDims {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec!["array_dims".to_string(), "list_dims".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayDims {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_dims"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => {
+                List(Arc::new(Field::new("item", UInt64, true)))
+            }
+            _ => {
+                return plan_err!("The array_dims function can only accept List/LargeList/FixedSizeList.");
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_dims(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArraySort,
+    array_sort,
+    array desc null_first,
+    "returns sorted array.",
+    array_sort_udf
+);
+
+#[derive(Debug)]
+pub struct ArraySort {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl ArraySort {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec!["array_sort".to_string(), "list_sort".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArraySort {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_sort"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        match &arg_types[0] {
+            List(field) | FixedSizeList(field, _) => Ok(List(Arc::new(Field::new(
+                "item",
+                field.data_type().clone(),
+                true,
+            )))),
+            LargeList(field) => Ok(LargeList(Arc::new(Field::new(
+                "item",
+                field.data_type().clone(),
+                true,
+            )))),
+            _ => exec_err!(
+                "Not reachable, data_type should be List, LargeList or FixedSizeList"
+            ),
+        }
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_sort(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    Cardinality,
+    cardinality,
+    array,
+    "returns the total number of elements in the array.",
+    cardinality_udf
+);
+
+impl Cardinality {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec![String::from("cardinality")],
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct Cardinality {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl ScalarUDFImpl for Cardinality {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "cardinality"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => UInt64,
+            _ => {
+                return plan_err!("The cardinality function can only accept List/LargeList/FixedSizeList.");
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::cardinality(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayNdims,
+    array_ndims,
+    array,
+    "returns the number of dimensions of the array.",
+    array_ndims_udf
+);
+
+#[derive(Debug)]
+pub struct ArrayNdims {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl ArrayNdims {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec![String::from("array_ndims"), String::from("list_ndims")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayNdims {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_ndims"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => UInt64,
+            _ => {
+                return plan_err!("The array_ndims function can only accept List/LargeList/FixedSizeList.");
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_ndims(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayEmpty,
+    array_empty,
+    array,
+    "returns true for an empty array or false for a non-empty array.",
+    array_empty_udf
+);
+
+#[derive(Debug)]
+pub struct ArrayEmpty {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl ArrayEmpty {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec![String::from("empty")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayEmpty {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "empty"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => Boolean,
+            _ => {
+                return plan_err!("The array_empty function can only accept List/LargeList/FixedSizeList.");
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_empty(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayRepeat,
+    array_repeat,
+    element count, // arg name
+    "returns an array containing element `count` times.", // doc
+    array_repeat_udf // internal function name
+);
+#[derive(Debug)]
+pub struct ArrayRepeat {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl ArrayRepeat {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec![String::from("array_repeat"), String::from("list_repeat")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayRepeat {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_repeat"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        Ok(List(Arc::new(Field::new(
+            "item",
+            arg_types[0].clone(),
+            true,
+        ))))
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_repeat(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayLength,
+    array_length,
+    array,
+    "returns the length of the array dimension.",
+    array_length_udf
+);
+
+#[derive(Debug)]
+pub struct ArrayLength {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl ArrayLength {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::variadic_any(Volatility::Immutable),
+            aliases: vec![String::from("array_length"), String::from("list_length")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrayLength {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_length"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        Ok(match arg_types[0] {
+            List(_) | LargeList(_) | FixedSizeList(_, _) => UInt64,
+            _ => {
+                return plan_err!("The array_length function can only accept List/LargeList/FixedSizeList.");
+            }
+        })
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_length(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    Flatten,
+    flatten,
+    array,
+    "flattens an array of arrays into a single array.",
+    flatten_udf
+);
+
+#[derive(Debug)]
+pub struct Flatten {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+impl Flatten {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec![String::from("flatten")],
+        }
+    }
+}
+
+impl ScalarUDFImpl for Flatten {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "flatten"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        fn get_base_type(data_type: &DataType) -> Result<DataType> {
+            match data_type {
+                List(field) | FixedSizeList(field, _)
+                    if matches!(field.data_type(), List(_) | FixedSizeList(_, _)) =>
+                {
+                    get_base_type(field.data_type())
+                }
+                LargeList(field) if matches!(field.data_type(), LargeList(_)) => {
+                    get_base_type(field.data_type())
+                }
+                Null | List(_) | LargeList(_) => Ok(data_type.to_owned()),
+                FixedSizeList(field, _) => Ok(List(field.clone())),
+                _ => exec_err!(
+                    "Not reachable, data_type should be List, LargeList or FixedSizeList"
+                ),
+            }
+        }
+
+        let data_type = get_base_type(&arg_types[0])?;
+        Ok(data_type)
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::flatten(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
+
+make_udf_function!(
+    ArrayDistinct,
+    array_distinct,
+    array,
+    "return distinct values from the array after removing duplicates.",
+    array_distinct_udf
+);
+
+#[derive(Debug)]
+pub struct ArrayDistinct {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl crate::udf::ArrayDistinct {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::array(Volatility::Immutable),
+            aliases: vec!["array_distinct".to_string(), "list_distinct".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for crate::udf::ArrayDistinct {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "array_distinct"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        use DataType::*;
+        match &arg_types[0] {
+            List(field) | FixedSizeList(field, _) => Ok(List(Arc::new(Field::new(
+                "item",
+                field.data_type().clone(),
+                true,
+            )))),
+            LargeList(field) => Ok(LargeList(Arc::new(Field::new(
+                "item",
+                field.data_type().clone(),
+                true,
+            )))),
+            _ => exec_err!(
+                "Not reachable, data_type should be List, LargeList or FixedSizeList"
+            ),
+        }
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        let args = ColumnarValue::values_to_arrays(args)?;
+        crate::kernels::array_distinct(&args).map(ColumnarValue::Array)
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+}
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 425ac207c33e..ef7d2c9b1892 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -107,3 +107,8 @@ required-features = ["datetime_expressions"]
 harness = false
 name = "to_char"
 required-features = ["datetime_expressions"]
+
+[[bench]]
+harness = false
+name = "substr_index"
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/substr_index.rs b/datafusion/functions/benches/substr_index.rs
new file mode 100644
index 000000000000..bb9a5b809eee
--- /dev/null
+++ b/datafusion/functions/benches/substr_index.rs
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use std::sync::Arc;
+
+use arrow::array::{ArrayRef, Int64Array, StringArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use rand::distributions::{Alphanumeric, Uniform};
+use rand::prelude::Distribution;
+use rand::Rng;
+
+use datafusion_expr::ColumnarValue;
+use datafusion_functions::unicode::substr_index;
+
+struct Filter<Dist, Test> {
+    dist: Dist,
+    test: Test,
+}
+
+impl<T, Dist, Test> Distribution<T> for Filter<Dist, Test>
+where
+    Dist: Distribution<T>,
+    Test: Fn(&T) -> bool,
+{
+    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> T {
+        loop {
+            let x = self.dist.sample(rng);
+            if (self.test)(&x) {
+                return x;
+            }
+        }
+    }
+}
+
+fn data() -> (StringArray, StringArray, Int64Array) {
+    let dist = Filter {
+        dist: Uniform::new(-4, 5),
+        test: |x: &i64| x != &0,
+    };
+    let mut rng = rand::thread_rng();
+    let mut strings: Vec<String> = vec![];
+    let mut delimiters: Vec<String> = vec![];
+    let mut counts: Vec<i64> = vec![];
+
+    for _ in 0..1000 {
+        let length = rng.gen_range(20..50);
+        let text: String = (&mut rng)
+            .sample_iter(&Alphanumeric)
+            .take(length)
+            .map(char::from)
+            .collect();
+        let char = rng.gen_range(0..text.len());
+        let delimiter = &text.chars().nth(char).unwrap();
+        let count = rng.sample(&dist);
+
+        strings.push(text);
+        delimiters.push(delimiter.to_string());
+        counts.push(count);
+    }
+
+    (
+        StringArray::from(strings),
+        StringArray::from(delimiters),
+        Int64Array::from(counts),
+    )
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    c.bench_function("substr_index_array_array_1000", |b| {
+        let (strings, delimiters, counts) = data();
+        let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
+        let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
+        let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
+
+        let args = [strings, delimiters, counts];
+        b.iter(|| {
+            black_box(
+                substr_index()
+                    .invoke(&args)
+                    .expect("substr_index should work on valid values"),
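+                // Illustrative comment (not part of the original benchmark):
+                // black_box prevents the optimizer from discarding the result,
+                // so the invoke call above is actually measured.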
+ ) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index b6c1b5eb9a38..d641389e0ae3 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -51,10 +51,16 @@ use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility} /// select arrow_cast(column_x, 'Float64') /// ``` #[derive(Debug)] -pub(super) struct ArrowCastFunc { +pub struct ArrowCastFunc { signature: Signature, } +impl Default for ArrowCastFunc { + fn default() -> Self { + Self::new() + } +} + impl ArrowCastFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/arrowtypeof.rs b/datafusion/functions/src/core/arrowtypeof.rs index 89702d3267ec..cc5e7e619bd8 100644 --- a/datafusion/functions/src/core/arrowtypeof.rs +++ b/datafusion/functions/src/core/arrowtypeof.rs @@ -22,10 +22,16 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct ArrowTypeOfFunc { +pub struct ArrowTypeOfFunc { signature: Signature, } +impl Default for ArrowTypeOfFunc { + fn default() -> Self { + Self::new() + } +} + impl ArrowTypeOfFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index e6313423867a..b00b8ea553f2 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -25,10 +25,16 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct GetFieldFunc { +pub struct GetFieldFunc { signature: Signature, } +impl Default for GetFieldFunc { + fn default() -> Self { + Self::new() + } +} + impl GetFieldFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 85d2410251c5..0f6920ccffa9 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -17,14 +17,14 @@ //! 
"core" DataFusion functions -mod arrow_cast; -mod arrowtypeof; -mod getfield; -mod named_struct; -mod nullif; -mod nvl; -mod nvl2; -mod r#struct; +pub mod arrow_cast; +pub mod arrowtypeof; +pub mod getfield; +pub mod named_struct; +pub mod nullif; +pub mod nvl; +pub mod nvl2; +pub mod r#struct; // create UDFs make_udf_function!(arrow_cast::ArrowCastFunc, ARROW_CAST, arrow_cast); diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs index 327a41baa741..8ccda977f3a4 100644 --- a/datafusion/functions/src/core/named_struct.rs +++ b/datafusion/functions/src/core/named_struct.rs @@ -74,10 +74,16 @@ fn named_struct_expr(args: &[ColumnarValue]) -> Result { } #[derive(Debug)] -pub(super) struct NamedStructFunc { +pub struct NamedStructFunc { signature: Signature, } +impl Default for NamedStructFunc { + fn default() -> Self { + Self::new() + } +} + impl NamedStructFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/nullif.rs b/datafusion/functions/src/core/nullif.rs index 1e903d7a881d..dc9696bd8d72 100644 --- a/datafusion/functions/src/core/nullif.rs +++ b/datafusion/functions/src/core/nullif.rs @@ -27,7 +27,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct NullIfFunc { +pub struct NullIfFunc { signature: Signature, } @@ -50,6 +50,12 @@ static SUPPORTED_NULLIF_TYPES: &[DataType] = &[ DataType::LargeUtf8, ]; +impl Default for NullIfFunc { + fn default() -> Self { + Self::new() + } +} + impl NullIfFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/nvl.rs b/datafusion/functions/src/core/nvl.rs index 76b037eb81ba..274e36fbdecc 100644 --- a/datafusion/functions/src/core/nvl.rs +++ b/datafusion/functions/src/core/nvl.rs @@ -23,7 +23,7 @@ use datafusion_common::{internal_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct NVLFunc { +pub struct NVLFunc { signature: Signature, aliases: Vec, } @@ -47,6 +47,12 @@ static SUPPORTED_NVL_TYPES: &[DataType] = &[ DataType::LargeUtf8, ]; +impl Default for NVLFunc { + fn default() -> Self { + Self::new() + } +} + impl NVLFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index a65657eaeafd..66b9ef566a78 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -23,10 +23,16 @@ use datafusion_common::{internal_err, plan_datafusion_err, Result}; use datafusion_expr::{utils, ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct NVL2Func { +pub struct NVL2Func { signature: Signature, } +impl Default for NVL2Func { + fn default() -> Self { + Self::new() + } +} + impl NVL2Func { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/core/struct.rs b/datafusion/functions/src/core/struct.rs index ac300e0abde3..9d4b2e4a0b8b 100644 --- a/datafusion/functions/src/core/struct.rs +++ b/datafusion/functions/src/core/struct.rs @@ -54,10 +54,16 @@ fn struct_expr(args: &[ColumnarValue]) -> Result { Ok(ColumnarValue::Array(array_struct(arrays.as_slice())?)) } #[derive(Debug)] -pub(super) struct StructFunc { +pub struct StructFunc { signature: Signature, } +impl Default for StructFunc { + fn default() -> Self { + Self::new() + } +} + impl StructFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/crypto/digest.rs 
b/datafusion/functions/src/crypto/digest.rs index c6556787cb9c..c9dd3c1f56a2 100644 --- a/datafusion/functions/src/crypto/digest.rs +++ b/datafusion/functions/src/crypto/digest.rs @@ -25,9 +25,15 @@ use datafusion_expr::{ use std::any::Any; #[derive(Debug)] -pub(super) struct DigestFunc { +pub struct DigestFunc { signature: Signature, } +impl Default for DigestFunc { + fn default() -> Self { + Self::new() + } +} + impl DigestFunc { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index 7b2936a37938..ccb6fbba80aa 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -23,9 +23,15 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct Md5Func { +pub struct Md5Func { signature: Signature, } +impl Default for Md5Func { + fn default() -> Self { + Self::new() + } +} + impl Md5Func { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/crypto/sha224.rs b/datafusion/functions/src/crypto/sha224.rs index ef0fae97cfa5..2795c4a25004 100644 --- a/datafusion/functions/src/crypto/sha224.rs +++ b/datafusion/functions/src/crypto/sha224.rs @@ -23,9 +23,15 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct SHA224Func { +pub struct SHA224Func { signature: Signature, } +impl Default for SHA224Func { + fn default() -> Self { + Self::new() + } +} + impl SHA224Func { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/crypto/sha256.rs b/datafusion/functions/src/crypto/sha256.rs index f763f925cc56..0a3f3b26e431 100644 --- a/datafusion/functions/src/crypto/sha256.rs +++ b/datafusion/functions/src/crypto/sha256.rs @@ -23,9 +23,15 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct SHA256Func { +pub struct SHA256Func { signature: Signature, } +impl Default for SHA256Func { + fn default() -> Self { + Self::new() + } +} + impl SHA256Func { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/crypto/sha384.rs b/datafusion/functions/src/crypto/sha384.rs index b382d42663be..c3f7845ce7bd 100644 --- a/datafusion/functions/src/crypto/sha384.rs +++ b/datafusion/functions/src/crypto/sha384.rs @@ -23,9 +23,15 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct SHA384Func { +pub struct SHA384Func { signature: Signature, } +impl Default for SHA384Func { + fn default() -> Self { + Self::new() + } +} + impl SHA384Func { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/crypto/sha512.rs b/datafusion/functions/src/crypto/sha512.rs index a852376fadd6..dc3bfac9d8bd 100644 --- a/datafusion/functions/src/crypto/sha512.rs +++ b/datafusion/functions/src/crypto/sha512.rs @@ -23,9 +23,15 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct SHA512Func { +pub struct SHA512Func { signature: Signature, } +impl Default for SHA512Func { + fn default() -> Self { + Self::new() + } +} + impl SHA512Func { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/datetime/current_date.rs b/datafusion/functions/src/datetime/current_date.rs index 5338234a8e49..8b180ff41b91 100644 --- 
a/datafusion/functions/src/datetime/current_date.rs +++ b/datafusion/functions/src/datetime/current_date.rs @@ -26,11 +26,17 @@ use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct CurrentDateFunc { +pub struct CurrentDateFunc { signature: Signature, aliases: Vec, } +impl Default for CurrentDateFunc { + fn default() -> Self { + Self::new() + } +} + impl CurrentDateFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/current_time.rs b/datafusion/functions/src/datetime/current_time.rs index b8a8aa2acb53..803759d4e904 100644 --- a/datafusion/functions/src/datetime/current_time.rs +++ b/datafusion/functions/src/datetime/current_time.rs @@ -26,10 +26,16 @@ use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct CurrentTimeFunc { +pub struct CurrentTimeFunc { signature: Signature, } +impl Default for CurrentTimeFunc { + fn default() -> Self { + Self::new() + } +} + impl CurrentTimeFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/date_bin.rs b/datafusion/functions/src/datetime/date_bin.rs index b7f20f68e89c..7f5d9bb5d921 100644 --- a/datafusion/functions/src/datetime/date_bin.rs +++ b/datafusion/functions/src/datetime/date_bin.rs @@ -40,10 +40,16 @@ use datafusion_expr::{ }; #[derive(Debug)] -pub(super) struct DateBinFunc { +pub struct DateBinFunc { signature: Signature, } +impl Default for DateBinFunc { + fn default() -> Self { + Self::new() + } +} + impl DateBinFunc { pub fn new() -> Self { let base_sig = |array_type: TimeUnit| { diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index b41f7e13cff2..111cdabe2bfb 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -39,11 +39,17 @@ use datafusion_expr::{ }; #[derive(Debug)] -pub(super) struct DatePartFunc { +pub struct DatePartFunc { signature: Signature, aliases: Vec, } +impl Default for DatePartFunc { + fn default() -> Self { + Self::new() + } +} + impl DatePartFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/date_trunc.rs b/datafusion/functions/src/datetime/date_trunc.rs index b4b25666516d..0414bf9c2a26 100644 --- a/datafusion/functions/src/datetime/date_trunc.rs +++ b/datafusion/functions/src/datetime/date_trunc.rs @@ -45,11 +45,17 @@ use datafusion_expr::{ }; #[derive(Debug)] -pub(super) struct DateTruncFunc { +pub struct DateTruncFunc { signature: Signature, aliases: Vec, } +impl Default for DateTruncFunc { + fn default() -> Self { + Self::new() + } +} + impl DateTruncFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/from_unixtime.rs b/datafusion/functions/src/datetime/from_unixtime.rs index f0d5016c0db9..d36ebe735ee7 100644 --- a/datafusion/functions/src/datetime/from_unixtime.rs +++ b/datafusion/functions/src/datetime/from_unixtime.rs @@ -25,10 +25,16 @@ use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct FromUnixtimeFunc { +pub struct FromUnixtimeFunc { signature: Signature, } +impl Default for FromUnixtimeFunc { + fn default() -> Self { + Self::new() + } +} + impl FromUnixtimeFunc { pub fn new() -> Self { Self { diff --git 
a/datafusion/functions/src/datetime/make_date.rs b/datafusion/functions/src/datetime/make_date.rs index 8afb36538363..6aa72572bc4d 100644 --- a/datafusion/functions/src/datetime/make_date.rs +++ b/datafusion/functions/src/datetime/make_date.rs @@ -30,10 +30,16 @@ use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct MakeDateFunc { +pub struct MakeDateFunc { signature: Signature, } +impl Default for MakeDateFunc { + fn default() -> Self { + Self::new() + } +} + impl MakeDateFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/mod.rs b/datafusion/functions/src/datetime/mod.rs index a2dfc93b05a3..c6939976eb02 100644 --- a/datafusion/functions/src/datetime/mod.rs +++ b/datafusion/functions/src/datetime/mod.rs @@ -21,19 +21,19 @@ use std::sync::Arc; use datafusion_expr::ScalarUDF; -mod common; -mod current_date; -mod current_time; -mod date_bin; -mod date_part; -mod date_trunc; -mod from_unixtime; -mod make_date; -mod now; -mod to_char; -mod to_date; -mod to_timestamp; -mod to_unixtime; +pub mod common; +pub mod current_date; +pub mod current_time; +pub mod date_bin; +pub mod date_part; +pub mod date_trunc; +pub mod from_unixtime; +pub mod make_date; +pub mod now; +pub mod to_char; +pub mod to_date; +pub mod to_timestamp; +pub mod to_unixtime; // create UDFs make_udf_function!(current_date::CurrentDateFunc, CURRENT_DATE, current_date); diff --git a/datafusion/functions/src/datetime/now.rs b/datafusion/functions/src/datetime/now.rs index cc7979df0d86..b2221215b94b 100644 --- a/datafusion/functions/src/datetime/now.rs +++ b/datafusion/functions/src/datetime/now.rs @@ -26,10 +26,16 @@ use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct NowFunc { +pub struct NowFunc { signature: Signature, } +impl Default for NowFunc { + fn default() -> Self { + Self::new() + } +} + impl NowFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/to_char.rs b/datafusion/functions/src/datetime/to_char.rs index ef5c45a5ad9c..f2e5af978ca0 100644 --- a/datafusion/functions/src/datetime/to_char.rs +++ b/datafusion/functions/src/datetime/to_char.rs @@ -35,11 +35,17 @@ use datafusion_expr::{ }; #[derive(Debug)] -pub(super) struct ToCharFunc { +pub struct ToCharFunc { signature: Signature, aliases: Vec, } +impl Default for ToCharFunc { + fn default() -> Self { + Self::new() + } +} + impl ToCharFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 077ad8f087b3..e491c0b55508 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -26,10 +26,16 @@ use datafusion_common::{exec_err, internal_datafusion_err, Result}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct ToDateFunc { +pub struct ToDateFunc { signature: Signature, } +impl Default for ToDateFunc { + fn default() -> Self { + Self::new() + } +} + impl ToDateFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/to_timestamp.rs b/datafusion/functions/src/datetime/to_timestamp.rs index b9cf8f3c604b..a7bcca62944c 100644 --- a/datafusion/functions/src/datetime/to_timestamp.rs +++ b/datafusion/functions/src/datetime/to_timestamp.rs @@ 
-30,30 +30,36 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use crate::datetime::common::*; #[derive(Debug)] -pub(super) struct ToTimestampFunc { +pub struct ToTimestampFunc { signature: Signature, } #[derive(Debug)] -pub(super) struct ToTimestampSecondsFunc { +pub struct ToTimestampSecondsFunc { signature: Signature, } #[derive(Debug)] -pub(super) struct ToTimestampMillisFunc { +pub struct ToTimestampMillisFunc { signature: Signature, } #[derive(Debug)] -pub(super) struct ToTimestampMicrosFunc { +pub struct ToTimestampMicrosFunc { signature: Signature, } #[derive(Debug)] -pub(super) struct ToTimestampNanosFunc { +pub struct ToTimestampNanosFunc { signature: Signature, } +impl Default for ToTimestampFunc { + fn default() -> Self { + Self::new() + } +} + impl ToTimestampFunc { pub fn new() -> Self { Self { @@ -62,6 +68,12 @@ impl ToTimestampFunc { } } +impl Default for ToTimestampSecondsFunc { + fn default() -> Self { + Self::new() + } +} + impl ToTimestampSecondsFunc { pub fn new() -> Self { Self { @@ -70,6 +82,12 @@ impl ToTimestampSecondsFunc { } } +impl Default for ToTimestampMillisFunc { + fn default() -> Self { + Self::new() + } +} + impl ToTimestampMillisFunc { pub fn new() -> Self { Self { @@ -78,6 +96,12 @@ impl ToTimestampMillisFunc { } } +impl Default for ToTimestampMicrosFunc { + fn default() -> Self { + Self::new() + } +} + impl ToTimestampMicrosFunc { pub fn new() -> Self { Self { @@ -86,6 +110,12 @@ impl ToTimestampMicrosFunc { } } +impl Default for ToTimestampNanosFunc { + fn default() -> Self { + Self::new() + } +} + impl ToTimestampNanosFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/datetime/to_unixtime.rs b/datafusion/functions/src/datetime/to_unixtime.rs index ed56cdf8d0af..396dadccb4b3 100644 --- a/datafusion/functions/src/datetime/to_unixtime.rs +++ b/datafusion/functions/src/datetime/to_unixtime.rs @@ -26,10 +26,16 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use super::to_timestamp::ToTimestampSecondsFunc; #[derive(Debug)] -pub(super) struct ToUnixtimeFunc { +pub struct ToUnixtimeFunc { signature: Signature, } +impl Default for ToUnixtimeFunc { + fn default() -> Self { + Self::new() + } +} + impl ToUnixtimeFunc { pub fn new() -> Self { Self { diff --git a/datafusion/functions/src/encoding/inner.rs b/datafusion/functions/src/encoding/inner.rs index a49a64228d2f..d9ce299a2602 100644 --- a/datafusion/functions/src/encoding/inner.rs +++ b/datafusion/functions/src/encoding/inner.rs @@ -37,10 +37,16 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct EncodeFunc { +pub struct EncodeFunc { signature: Signature, } +impl Default for EncodeFunc { + fn default() -> Self { + Self::new() + } +} + impl EncodeFunc { pub fn new() -> Self { use DataType::*; @@ -91,10 +97,16 @@ impl ScalarUDFImpl for EncodeFunc { } #[derive(Debug)] -pub(super) struct DecodeFunc { +pub struct DecodeFunc { signature: Signature, } +impl Default for DecodeFunc { + fn default() -> Self { + Self::new() + } +} + impl DecodeFunc { pub fn new() -> Self { use DataType::*; diff --git a/datafusion/functions/src/encoding/mod.rs b/datafusion/functions/src/encoding/mod.rs index 9d9f07dff7ce..49f914a68774 100644 --- a/datafusion/functions/src/encoding/mod.rs +++ b/datafusion/functions/src/encoding/mod.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-mod inner;
+pub mod inner;
 
 // create `encode` and `decode` UDFs
 make_udf_function!(inner::EncodeFunc, ENCODE, encode);
diff --git a/datafusion/functions/src/macros.rs b/datafusion/functions/src/macros.rs
index 4907d74fe941..c92cb27ef5bb 100644
--- a/datafusion/functions/src/macros.rs
+++ b/datafusion/functions/src/macros.rs
@@ -251,7 +251,112 @@ macro_rules! make_math_unary_udf {
     };
 }
 
-#[macro_export]
+/// Macro to create a binary math UDF.
+///
+/// A binary math function takes two arguments of types Float32 or Float64,
+/// applies a binary floating-point function to the arguments, and returns a
+/// value of the same type.
+///
+/// $UDF: the name of the UDF struct that implements `ScalarUDFImpl`
+/// $GNAME: a singleton instance of the UDF
+/// $NAME: the name of the function
+/// $BINARY_FUNC: the binary function to apply to the arguments
+/// $MONOTONICITY: the monotonicity of the function
+macro_rules! make_math_binary_udf {
+    ($UDF:ident, $GNAME:ident, $NAME:ident, $BINARY_FUNC:ident, $MONOTONICITY:expr) => {
+        make_udf_function!($NAME::$UDF, $GNAME, $NAME);
+
+        mod $NAME {
+            use arrow::array::{ArrayRef, Float32Array, Float64Array};
+            use arrow::datatypes::DataType;
+            use datafusion_common::{exec_err, DataFusionError, Result};
+            use datafusion_expr::TypeSignature::*;
+            use datafusion_expr::{
+                ColumnarValue, FuncMonotonicity, ScalarUDFImpl, Signature, Volatility,
+            };
+            use std::any::Any;
+            use std::sync::Arc;
+
+            #[derive(Debug)]
+            pub struct $UDF {
+                signature: Signature,
+            }
+
+            impl $UDF {
+                pub fn new() -> Self {
+                    use DataType::*;
+                    Self {
+                        signature: Signature::one_of(
+                            vec![
+                                Exact(vec![Float32, Float32]),
+                                Exact(vec![Float64, Float64]),
+                            ],
+                            Volatility::Immutable,
+                        ),
+                    }
+                }
+            }
+
+            impl ScalarUDFImpl for $UDF {
+                fn as_any(&self) -> &dyn Any {
+                    self
+                }
+                fn name(&self) -> &str {
+                    stringify!($NAME)
+                }
+
+                fn signature(&self) -> &Signature {
+                    &self.signature
+                }
+
+                fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+                    let arg_type = &arg_types[0];
+
+                    match arg_type {
+                        DataType::Float32 => Ok(DataType::Float32),
+                        // For other types (possible values float64/null/int), use Float64
+                        _ => Ok(DataType::Float64),
+                    }
+                }
+
+                fn monotonicity(&self) -> Result<Option<FuncMonotonicity>> {
+                    Ok($MONOTONICITY)
+                }
+
+                fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+                    let args = ColumnarValue::values_to_arrays(args)?;
+
+                    let arr: ArrayRef = match args[0].data_type() {
+                        DataType::Float64 => Arc::new(make_function_inputs2!(
+                            &args[0],
+                            &args[1],
+                            "y",
+                            "x",
+                            Float64Array,
+                            { f64::$BINARY_FUNC }
+                        )),
+
+                        DataType::Float32 => Arc::new(make_function_inputs2!(
+                            &args[0],
+                            &args[1],
+                            "y",
+                            "x",
+                            Float32Array,
+                            { f32::$BINARY_FUNC }
+                        )),
+                        other => {
+                            return exec_err!(
+                                "Unsupported data type {other:?} for function {}",
+                                self.name()
+                            )
+                        }
+                    };
+                    Ok(ColumnarValue::Array(arr))
+                }
+            }
+        }
+    };
+}
+
 macro_rules! make_function_inputs2 {
     ($ARG1: expr, $ARG2: expr, $NAME1:expr, $NAME2: expr, $ARRAY_TYPE:ident, $FUNC: block) => {{
         let arg1 = downcast_arg!($ARG1, $NAME1, $ARRAY_TYPE);
diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs
index 8aa48460ff69..e05dc8665285 100644
--- a/datafusion/functions/src/math/abs.rs
+++ b/datafusion/functions/src/math/abs.rs
@@ -104,10 +104,16 @@ fn create_abs_function(input_data_type: &DataType) -> Result<MathArrayFunction>
     }
 }
 #[derive(Debug)]
-pub(super) struct AbsFunc {
+pub struct AbsFunc {
     signature: Signature,
 }
 
+impl Default for AbsFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl AbsFunc {
     pub fn new() -> Self {
         Self {
diff --git a/datafusion/functions/src/math/atan2.rs b/datafusion/functions/src/math/atan2.rs
deleted file mode 100644
index b090c6c454fd..000000000000
--- a/datafusion/functions/src/math/atan2.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-//! Math function: `atan2()`.
-
-use arrow::array::{ArrayRef, Float32Array, Float64Array};
-use arrow::datatypes::DataType;
-use datafusion_common::DataFusionError;
-use datafusion_common::{exec_err, Result};
-use datafusion_expr::ColumnarValue;
-use datafusion_expr::TypeSignature::*;
-use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
-use std::any::Any;
-use std::sync::Arc;
-
-use crate::make_function_inputs2;
-use crate::utils::make_scalar_function;
-
-#[derive(Debug)]
-pub(super) struct Atan2 {
-    signature: Signature,
-}
-
-impl Atan2 {
-    pub fn new() -> Self {
-        use DataType::*;
-        Self {
-            signature: Signature::one_of(
-                vec![Exact(vec![Float32, Float32]), Exact(vec![Float64, Float64])],
-                Volatility::Immutable,
-            ),
-        }
-    }
-}
-
-impl ScalarUDFImpl for Atan2 {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-    fn name(&self) -> &str {
-        "atan2"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        use self::DataType::*;
-        match &arg_types[0] {
-            Float32 => Ok(Float32),
-            _ => Ok(Float64),
-        }
-    }
-
-    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        make_scalar_function(atan2, vec![])(args)
-    }
-}
-
-/// Atan2 SQL function
-pub fn atan2(args: &[ArrayRef]) -> Result<ArrayRef> {
-    match args[0].data_type() {
-        DataType::Float64 => Ok(Arc::new(make_function_inputs2!(
-            &args[0],
-            &args[1],
-            "y",
-            "x",
-            Float64Array,
-            { f64::atan2 }
-        )) as ArrayRef),
-
-        DataType::Float32 => Ok(Arc::new(make_function_inputs2!(
-            &args[0],
-            &args[1],
-            "y",
-            "x",
-            Float32Array,
-            { f32::atan2 }
-        )) as ArrayRef),
-
-        other => exec_err!("Unsupported data type {other:?} for function atan2"),
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::*;
-    use datafusion_common::cast::{as_float32_array, as_float64_array};
-
-    #[test]
-    fn test_atan2_f64() {
-        let args: Vec<ArrayRef> = vec![
-            Arc::new(Float64Array::from(vec![2.0, -3.0, 4.0, -5.0])), // y
-            Arc::new(Float64Array::from(vec![1.0, 2.0, -3.0, -4.0])), // x
-        ];
-
-        let result = atan2(&args).expect("failed to initialize function atan2");
-        let floats =
-            as_float64_array(&result).expect("failed to initialize function atan2");
-
-        assert_eq!(floats.len(), 4);
-        assert_eq!(floats.value(0), (2.0_f64).atan2(1.0));
-        assert_eq!(floats.value(1), (-3.0_f64).atan2(2.0));
-        assert_eq!(floats.value(2), (4.0_f64).atan2(-3.0));
-        assert_eq!(floats.value(3), (-5.0_f64).atan2(-4.0));
-    }
-
-    #[test]
-    fn test_atan2_f32() {
-        let args: Vec<ArrayRef> = vec![
-            Arc::new(Float32Array::from(vec![2.0, -3.0, 4.0, -5.0])), // y
-            Arc::new(Float32Array::from(vec![1.0, 2.0, -3.0, -4.0])), // x
-        ];
-
-        let result = atan2(&args).expect("failed to initialize function atan2");
-        let floats =
-            as_float32_array(&result).expect("failed to initialize function atan2");
-
-        assert_eq!(floats.len(), 4);
-        assert_eq!(floats.value(0), (2.0_f32).atan2(1.0));
-        assert_eq!(floats.value(1), (-3.0_f32).atan2(2.0));
-        assert_eq!(floats.value(2), (4.0_f32).atan2(-3.0));
-        assert_eq!(floats.value(3), (-5.0_f32).atan2(-4.0));
-    }
-}
diff --git a/datafusion/functions/src/math/mod.rs b/datafusion/functions/src/math/mod.rs
index 2ee1fffa1625..a3ec2e3b90ce 100644
--- a/datafusion/functions/src/math/mod.rs
+++ b/datafusion/functions/src/math/mod.rs
@@ -17,14 +17,12 @@
 
 //! "math" DataFusion functions
 
-mod abs;
-mod atan2;
-mod nans;
+pub mod abs;
+pub mod nans;
 
 // Create UDFs
 make_udf_function!(nans::IsNanFunc, ISNAN, isnan);
 make_udf_function!(abs::AbsFunc, ABS, abs);
-make_udf_function!(atan2::Atan2, ATAN2, atan2);
 
 make_math_unary_udf!(Log2Func, LOG2, log2, log2, Some(vec![Some(true)]));
 make_math_unary_udf!(Log10Func, LOG10, log10, log10, Some(vec![Some(true)]));
@@ -39,6 +37,13 @@ make_math_unary_udf!(AtanhFunc, ATANH, atanh, atanh, Some(vec![Some(true)]));
 make_math_unary_udf!(AsinhFunc, ASINH, asinh, asinh, Some(vec![Some(true)]));
 make_math_unary_udf!(AcoshFunc, ACOSH, acosh, acosh, Some(vec![Some(true)]));
 make_math_unary_udf!(AtanFunc, ATAN, atan, atan, Some(vec![Some(true)]));
+make_math_binary_udf!(Atan2, ATAN2, atan2, atan2, Some(vec![Some(true)]));
+
+make_math_unary_udf!(RadiansFunc, RADIANS, radians, to_radians, None);
+make_math_unary_udf!(SignumFunc, SIGNUM, signum, signum, None);
+make_math_unary_udf!(SinFunc, SIN, sin, sin, None);
+make_math_unary_udf!(SinhFunc, SINH, sinh, sinh, None);
+make_math_unary_udf!(SqrtFunc, SQRT, sqrt, sqrt, None);
 
 // Export the functions out of this package, both as expr_fn as well as a list of functions
 export_functions!(
@@ -67,5 +72,10 @@ export_functions!(
     (asinh, num, "returns inverse hyperbolic sine"),
     (acosh, num, "returns inverse hyperbolic cosine"),
     (atan, num, "returns inverse tangent"),
-    (atan2, y x, "returns inverse tangent of a division given in the argument")
+    (atan2, y x, "returns inverse tangent of a division given in the argument"),
+    (radians, num, "converts degrees to radians"),
+    (signum, num, "sign of the argument (-1, 0, +1)"),
+    (sin, num, "sine"),
+    (sinh, num, "hyperbolic sine"),
+    (sqrt, num, "square root of a number")
 );
diff --git a/datafusion/functions/src/math/nans.rs b/datafusion/functions/src/math/nans.rs
index 3f3d7d197c33..2bd704a7de2e 100644
--- a/datafusion/functions/src/math/nans.rs
+++ b/datafusion/functions/src/math/nans.rs
@@ -28,10 +28,16 @@ use std::any::Any;
 use std::sync::Arc;
 
 #[derive(Debug)]
-pub(super) struct IsNanFunc {
+pub struct IsNanFunc {
     signature: Signature,
 }
 
+impl Default for IsNanFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl IsNanFunc {
     pub fn new() -> Self {
         use DataType::*;
diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs
index 94dd7529e8ff..09b96a28c107 100644
--- a/datafusion/functions/src/regex/regexplike.rs
+++ b/datafusion/functions/src/regex/regexplike.rs
@@ -32,9 +32,15 @@ use std::any::Any;
 use std::sync::Arc;
 
 #[derive(Debug)]
-pub(super) struct RegexpLikeFunc {
+pub struct RegexpLikeFunc {
     signature: Signature,
 }
+
+impl Default for RegexpLikeFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl RegexpLikeFunc {
     pub fn new() -> Self {
         use DataType::*;
diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs
index 5178188424fb..73228e608143 100644
--- a/datafusion/functions/src/regex/regexpmatch.rs
+++ b/datafusion/functions/src/regex/regexpmatch.rs
@@ -33,9 +33,15 @@ use std::any::Any;
 use std::sync::Arc;
 
 #[derive(Debug)]
-pub(super) struct RegexpMatchFunc {
+pub struct RegexpMatchFunc {
     signature: Signature,
 }
+
+impl Default for RegexpMatchFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl RegexpMatchFunc {
     pub fn new() -> Self {
         use DataType::*;
diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs
index d90996e04b3f..4e21883c9752 100644
--- a/datafusion/functions/src/regex/regexpreplace.rs
+++ b/datafusion/functions/src/regex/regexpreplace.rs
@@ -38,9 +38,15 @@ use std::collections::HashMap;
 use std::sync::Arc;
 use std::sync::OnceLock;
 #[derive(Debug)]
-pub(super) struct RegexpReplaceFunc {
+pub struct RegexpReplaceFunc {
     signature: Signature,
 }
+
+impl Default for RegexpReplaceFunc {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
 impl RegexpReplaceFunc {
     pub fn new() -> Self {
         use DataType::*;
diff --git a/datafusion/functions/src/string/ascii.rs b/datafusion/functions/src/string/ascii.rs
index 9a07f4c19cf1..15a3c2391ac6 100644
--- a/datafusion/functions/src/string/ascii.rs
+++ b/datafusion/functions/src/string/ascii.rs
@@ -44,7 +44,7 @@ pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
 }
 
 #[derive(Debug)]
-pub(super) struct AsciiFunc {
+pub struct AsciiFunc {
     signature: Signature,
 }
 impl AsciiFunc {
diff --git a/datafusion/functions/src/string/bit_length.rs b/datafusion/functions/src/string/bit_length.rs
index 6a200471d42d..17c49216553b 100644
--- a/datafusion/functions/src/string/bit_length.rs
+++ b/datafusion/functions/src/string/bit_length.rs
@@ -27,7 +27,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature};
 use crate::utils::utf8_to_int_type;
 
 #[derive(Debug)]
-pub(super) struct BitLengthFunc {
+pub struct BitLengthFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/string/btrim.rs b/datafusion/functions/src/string/btrim.rs
index 573a23d07021..b0a85eab6d83 100644
--- a/datafusion/functions/src/string/btrim.rs
+++ b/datafusion/functions/src/string/btrim.rs
@@ -35,7 +35,7 @@ fn btrim(args: &[ArrayRef]) -> Result<ArrayRef> {
 }
 
 #[derive(Debug)]
-pub(super) struct BTrimFunc {
+pub struct BTrimFunc {
     signature: Signature,
     aliases: Vec<String>,
 }
diff --git a/datafusion/functions/src/string/chr.rs b/datafusion/functions/src/string/chr.rs
index d1f8dc398a2b..21d79cf6b0f1 100644
--- a/datafusion/functions/src/string/chr.rs
+++ b/datafusion/functions/src/string/chr.rs
@@ -61,7 +61,7 @@ pub fn chr(args: &[ArrayRef]) -> Result<ArrayRef> {
 }
 
 #[derive(Debug)]
-pub(super) struct ChrFunc {
+pub struct ChrFunc {
     signature: Signature,
 }
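The visibility edits in this patch all follow one pattern: a function struct goes from
pub(super) to pub, and (where shown) gains a Default impl that forwards to new(). A
minimal sketch of what that enables for downstream code, illustrative only and not part
of the patch; it assumes the datafusion-functions module tree is public (as done for
`core` above) and uses ScalarUDF::new_from_impl from datafusion-expr:

    use datafusion_expr::ScalarUDF;
    use datafusion_functions::core::arrowtypeof::ArrowTypeOfFunc;

    fn main() {
        // With the struct `pub` and `Default` implemented, callers can build
        // their own ScalarUDF wrapper instead of relying on the crate-provided
        // singleton accessor.
        let udf = ScalarUDF::new_from_impl(ArrowTypeOfFunc::default());
        assert_eq!(udf.name(), "arrow_typeof");
    }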
diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs index 8f497e73e393..390b0d72bee2 100644 --- a/datafusion/functions/src/string/levenshtein.rs +++ b/datafusion/functions/src/string/levenshtein.rs @@ -30,7 +30,7 @@ use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] -pub(super) struct LevenshteinFunc { +pub struct LevenshteinFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index 327772bd808d..a1eff7042211 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -27,7 +27,7 @@ use crate::string::common::handle; use crate::utils::utf8_to_str_type; #[derive(Debug)] -pub(super) struct LowerFunc { +pub struct LowerFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index e6926e5bd56e..ad86259d0d7e 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -35,7 +35,7 @@ fn ltrim(args: &[ArrayRef]) -> Result { } #[derive(Debug)] -pub(super) struct LtrimFunc { +pub struct LtrimFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/octet_length.rs b/datafusion/functions/src/string/octet_length.rs index 639bf6cb48a9..bdd262b7e37e 100644 --- a/datafusion/functions/src/string/octet_length.rs +++ b/datafusion/functions/src/string/octet_length.rs @@ -27,7 +27,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::utils::utf8_to_int_type; #[derive(Debug)] -pub(super) struct OctetLengthFunc { +pub struct OctetLengthFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/overlay.rs b/datafusion/functions/src/string/overlay.rs index 8b9cc03afc4d..3f92a73c1af9 100644 --- a/datafusion/functions/src/string/overlay.rs +++ b/datafusion/functions/src/string/overlay.rs @@ -30,7 +30,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] -pub(super) struct OverlayFunc { +pub struct OverlayFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index f4319af0a5c4..77521120d9d8 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -30,7 +30,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] -pub(super) struct RepeatFunc { +pub struct RepeatFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index e869ac205440..01a3762acaf4 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -30,7 +30,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] -pub(super) struct ReplaceFunc { +pub struct ReplaceFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/rtrim.rs b/datafusion/functions/src/string/rtrim.rs index d04d15ce8847..607e647b2615 100644 --- a/datafusion/functions/src/string/rtrim.rs +++ b/datafusion/functions/src/string/rtrim.rs @@ -35,7 +35,7 @@ fn rtrim(args: &[ArrayRef]) -> Result { } #[derive(Debug)] -pub(super) struct RtrimFunc { +pub struct RtrimFunc { signature: Signature, } diff --git 
a/datafusion/functions/src/string/split_part.rs b/datafusion/functions/src/string/split_part.rs index 0aa968a1ef5b..4396386afff5 100644 --- a/datafusion/functions/src/string/split_part.rs +++ b/datafusion/functions/src/string/split_part.rs @@ -30,7 +30,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature}; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] -pub(super) struct SplitPartFunc { +pub struct SplitPartFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/starts_with.rs b/datafusion/functions/src/string/starts_with.rs index f1b03907f8d8..edbf5c9217a7 100644 --- a/datafusion/functions/src/string/starts_with.rs +++ b/datafusion/functions/src/string/starts_with.rs @@ -40,7 +40,7 @@ pub fn starts_with(args: &[ArrayRef]) -> Result { } #[derive(Debug)] -pub(super) struct StartsWithFunc { +pub struct StartsWithFunc { signature: Signature, } impl StartsWithFunc { diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index ab320c68d493..feedeb47f564 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -60,7 +60,7 @@ where } #[derive(Debug)] -pub(super) struct ToHexFunc { +pub struct ToHexFunc { signature: Signature, } impl ToHexFunc { diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index 066174abf277..c21824d30d53 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -24,7 +24,7 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; #[derive(Debug)] -pub(super) struct UpperFunc { +pub struct UpperFunc { signature: Signature, } diff --git a/datafusion/functions/src/string/uuid.rs b/datafusion/functions/src/string/uuid.rs index 791ad6d3c4f3..c68871d42e9f 100644 --- a/datafusion/functions/src/string/uuid.rs +++ b/datafusion/functions/src/string/uuid.rs @@ -29,7 +29,7 @@ use datafusion_expr::{ColumnarValue, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; #[derive(Debug)] -pub(super) struct UuidFunc { +pub struct UuidFunc { signature: Signature, } diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index 51331bf9a586..7e2723771ff2 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -28,7 +28,7 @@ use std::any::Any; use std::sync::Arc; #[derive(Debug)] -pub(super) struct CharacterLengthFunc { +pub struct CharacterLengthFunc { signature: Signature, aliases: Vec, } diff --git a/datafusion/functions/src/unicode/find_in_set.rs b/datafusion/functions/src/unicode/find_in_set.rs index 7e0306d49454..fc45f897c5f4 100644 --- a/datafusion/functions/src/unicode/find_in_set.rs +++ b/datafusion/functions/src/unicode/find_in_set.rs @@ -31,7 +31,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use crate::utils::{make_scalar_function, utf8_to_int_type}; #[derive(Debug)] -pub(super) struct FindInSetFunc { +pub struct FindInSetFunc { signature: Signature, } diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index 473589fdc8aa..24ea2d5a8f25 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -31,7 +31,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; use crate::utils::{make_scalar_function, utf8_to_str_type}; #[derive(Debug)] 
-pub(super) struct LeftFunc {
+pub struct LeftFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index 76a8e68cca25..47208903bcef 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -29,7 +29,7 @@ use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 
 #[derive(Debug)]
-pub(super) struct LPadFunc {
+pub struct LPadFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/reverse.rs b/datafusion/functions/src/unicode/reverse.rs
index 42ca6e0d17c3..6b24c2336810 100644
--- a/datafusion/functions/src/unicode/reverse.rs
+++ b/datafusion/functions/src/unicode/reverse.rs
@@ -28,7 +28,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
-pub(super) struct ReverseFunc {
+pub struct ReverseFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs
index d1bd976342b2..dddbf31e721b 100644
--- a/datafusion/functions/src/unicode/right.rs
+++ b/datafusion/functions/src/unicode/right.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
-pub(super) struct RightFunc {
+pub struct RightFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 070278c90b2f..8946f07006b7 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -29,7 +29,7 @@ use datafusion_expr::TypeSignature::Exact;
 use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 
 #[derive(Debug)]
-pub(super) struct RPadFunc {
+pub struct RPadFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs
index 1e8bfa37d40e..4ebdd9d58623 100644
--- a/datafusion/functions/src/unicode/strpos.rs
+++ b/datafusion/functions/src/unicode/strpos.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_int_type};
 
 #[derive(Debug)]
-pub(super) struct StrposFunc {
+pub struct StrposFunc {
     signature: Signature,
     aliases: Vec<String>,
 }
diff --git a/datafusion/functions/src/unicode/substr.rs b/datafusion/functions/src/unicode/substr.rs
index 403157e2a85a..260937a01a74 100644
--- a/datafusion/functions/src/unicode/substr.rs
+++ b/datafusion/functions/src/unicode/substr.rs
@@ -30,7 +30,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
-pub(super) struct SubstrFunc {
+pub struct SubstrFunc {
     signature: Signature,
 }
diff --git a/datafusion/functions/src/unicode/substrindex.rs b/datafusion/functions/src/unicode/substrindex.rs
index 77e8116fff4c..d00108a68fc9 100644
--- a/datafusion/functions/src/unicode/substrindex.rs
+++ b/datafusion/functions/src/unicode/substrindex.rs
@@ -29,7 +29,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
-pub(super) struct SubstrIndexFunc {
+pub struct SubstrIndexFunc {
     signature: Signature,
     aliases: Vec<String>,
 }
diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs
index bc1836700304..25daf8738b21 100644
--- a/datafusion/functions/src/unicode/translate.rs
+++ b/datafusion/functions/src/unicode/translate.rs
@@ -31,7 +31,7 @@ use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 
 #[derive(Debug)]
-pub(super) struct TranslateFunc {
+pub struct TranslateFunc {
     signature: Signature,
 }
diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs
index b21ec851dfcd..88202ffd21f1 100644
--- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs
+++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs
@@ -23,7 +23,7 @@ use crate::analyzer::AnalyzerRule;
 
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
-use datafusion_common::Result;
+use datafusion_common::{Column, Result};
 use datafusion_expr::expr::{Exists, InSubquery};
 use datafusion_expr::{
     logical_plan::LogicalPlan, Expr, Filter, LogicalPlanBuilder, TableScan,
@@ -119,9 +119,9 @@ fn generate_projection_expr(
     let mut exprs = vec![];
     if let Some(projection) = projection {
         for i in projection {
-            exprs.push(Expr::Column(
-                sub_plan.schema().fields()[*i].qualified_column(),
-            ));
+            exprs.push(Expr::Column(Column::from(
+                sub_plan.schema().qualified_field(*i),
+            )));
         }
     } else {
         exprs.push(Expr::Wildcard { qualifier: None });
diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs
index c76c1c8a7bd0..b7b7c4f20e4a 100644
--- a/datafusion/optimizer/src/analyzer/type_coercion.rs
+++ b/datafusion/optimizer/src/analyzer/type_coercion.rs
@@ -756,9 +756,9 @@ mod test {
     };
     use crate::test::assert_analyzed_plan_eq;
 
-    use arrow::datatypes::{DataType, TimeUnit};
+    use arrow::datatypes::{DataType, Field, TimeUnit};
     use datafusion_common::tree_node::{TransformedResult, TreeNode};
-    use datafusion_common::{DFField, DFSchema, DFSchemaRef, Result, ScalarValue};
+    use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue};
     use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction};
     use datafusion_expr::logical_plan::{EmptyRelation, Projection};
     use datafusion_expr::{
@@ -781,8 +781,8 @@ mod test {
     Arc::new(LogicalPlan::EmptyRelation(EmptyRelation {
         produce_one_row: false,
         schema: Arc::new(
-            DFSchema::new_with_metadata(
-                vec![DFField::new_unqualified("a", data_type, true)],
+            DFSchema::from_unqualifed_fields(
+                vec![Field::new("a", data_type, true)].into(),
                 std::collections::HashMap::new(),
             )
             .unwrap(),
@@ -1042,12 +1042,8 @@ mod test {
         let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false);
         let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation {
             produce_one_row: false,
-            schema: Arc::new(DFSchema::new_with_metadata(
-                vec![DFField::new_unqualified(
-                    "a",
-                    DataType::Decimal128(12, 4),
-                    true,
-                )],
+            schema: Arc::new(DFSchema::from_unqualifed_fields(
+                vec![Field::new("a", DataType::Decimal128(12, 4), true)].into(),
                 std::collections::HashMap::new(),
             )?),
         }));
@@ -1251,8 +1247,8 @@ mod test {
     #[test]
     fn test_type_coercion_rewrite() -> Result<()> {
         // gt
-        let schema = Arc::new(DFSchema::new_with_metadata(
-            vec![DFField::new_unqualified("a", DataType::Int64, true)],
+        let schema = Arc::new(DFSchema::from_unqualifed_fields(
+            vec![Field::new("a", DataType::Int64, true)].into(),
             std::collections::HashMap::new(),
         )?);
         let mut rewriter = TypeCoercionRewriter { schema };
@@ -1262,8 +1258,8 @@ mod test {
         assert_eq!(expected, result);
 
         // eq
-        let schema = Arc::new(DFSchema::new_with_metadata(
-            vec![DFField::new_unqualified("a", DataType::Int64, true)],
+        let schema = Arc::new(DFSchema::from_unqualifed_fields(
+            vec![Field::new("a", DataType::Int64, true)].into(),
             std::collections::HashMap::new(),
         )?);
         let mut rewriter = TypeCoercionRewriter { schema };
@@ -1273,8 +1269,8 @@ mod test {
         assert_eq!(expected, result);
 
         // lt
-        let schema = Arc::new(DFSchema::new_with_metadata(
-            vec![DFField::new_unqualified("a", DataType::Int64, true)],
+        let schema = Arc::new(DFSchema::from_unqualifed_fields(
+            vec![Field::new("a", DataType::Int64, true)].into(),
             std::collections::HashMap::new(),
         )?);
         let mut rewriter = TypeCoercionRewriter { schema };
@@ -1346,26 +1342,27 @@ mod test {
     #[test]
     fn test_case_expression_coercion() -> Result<()> {
-        let schema = Arc::new(DFSchema::new_with_metadata(
+        let schema = Arc::new(DFSchema::from_unqualifed_fields(
             vec![
-                DFField::new_unqualified("boolean", DataType::Boolean, true),
-                DFField::new_unqualified("integer", DataType::Int32, true),
-                DFField::new_unqualified("float", DataType::Float32, true),
-                DFField::new_unqualified(
-                    "timestamp",
-                    DataType::Timestamp(TimeUnit::Nanosecond, None),
-                    true,
-                ),
-                DFField::new_unqualified("date", DataType::Date32, true),
-                DFField::new_unqualified(
-                    "interval",
-                    DataType::Interval(arrow::datatypes::IntervalUnit::MonthDayNano),
-                    true,
-                ),
-                DFField::new_unqualified("binary", DataType::Binary, true),
-                DFField::new_unqualified("string", DataType::Utf8, true),
-                DFField::new_unqualified("decimal", DataType::Decimal128(10, 10), true),
-            ],
+                Field::new("boolean", DataType::Boolean, true),
+                Field::new("integer", DataType::Int32, true),
+                Field::new("float", DataType::Float32, true),
+                Field::new(
+                    "timestamp",
+                    DataType::Timestamp(TimeUnit::Nanosecond, None),
+                    true,
+                ),
+                Field::new("date", DataType::Date32, true),
+                Field::new(
+                    "interval",
+                    DataType::Interval(arrow::datatypes::IntervalUnit::MonthDayNano),
+                    true,
+                ),
+                Field::new("binary", DataType::Binary, true),
+                Field::new("string", DataType::Utf8, true),
+                Field::new("decimal", DataType::Decimal128(10, 10), true),
+            ]
+            .into(),
             std::collections::HashMap::new(),
         )?);
diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs
index 25c25c63f0b7..77613aa66293 100644
--- a/datafusion/optimizer/src/common_subexpr_eliminate.rs
+++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs
@@ -24,13 +24,13 @@ use std::sync::Arc;
 use crate::utils::is_volatile_expression;
 use crate::{utils, OptimizerConfig, OptimizerRule};
 
-use arrow::datatypes::DataType;
+use arrow::datatypes::{DataType, Field};
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter,
     TreeNodeVisitor,
 };
 use datafusion_common::{
-    internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result,
+    internal_err, qualified_name, Column, DFSchema, DFSchemaRef, DataFusionError, Result,
 };
 use datafusion_expr::expr::Alias;
 use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window};
@@ -331,8 +331,10 @@ impl CommonSubexprEliminate {
                     proj_exprs.push(Expr::Column(Column::from_name(name)));
                 } else {
                     let id = ExprSet::expr_identifier(&expr_rewritten);
-                    let out_name =
-                        expr_rewritten.to_field(&new_input_schema)?.qualified_name();
+                    let (qualifier, field) =
+                        expr_rewritten.to_field(&new_input_schema)?;
+                    let out_name = qualified_name(qualifier.as_ref(), field.name());
+
                     agg_exprs.push(expr_rewritten.alias(&id));
                     proj_exprs
                         .push(Expr::Column(Column::from_name(id)).alias(out_name));
@@ -469,7 +471,7 @@ fn build_common_expr_project_plan(
         match expr_set.get(&id) {
             Some((expr, _, data_type, symbol)) => {
                 // todo: check `nullable`
-                let field = DFField::new_unqualified(&id, data_type.clone(), true);
+                let field = Field::new(&id, data_type.clone(), true);
                 fields_set.insert(field.name().to_owned());
                 project_exprs.push(expr.clone().alias(symbol.as_str()));
             }
@@ -479,9 +481,9 @@ fn build_common_expr_project_plan(
         }
    }
 
-    for field in input.schema().fields() {
-        if fields_set.insert(field.qualified_name()) {
-            project_exprs.push(Expr::Column(field.qualified_column()));
+    for (qualifier, field) in input.schema().iter() {
+        if fields_set.insert(qualified_name(qualifier, field.name())) {
+            project_exprs.push(Expr::Column(Column::from((qualifier, field.as_ref()))));
         }
     }
 
@@ -500,9 +502,8 @@ fn build_recover_project_plan(
     input: LogicalPlan,
 ) -> Result<LogicalPlan> {
     let col_exprs = schema
-        .fields()
         .iter()
-        .map(|field| Expr::Column(field.qualified_column()))
+        .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field.as_ref()))))
         .collect();
     Ok(LogicalPlan::Projection(Projection::try_new(
         col_exprs,
@@ -517,10 +518,14 @@ fn extract_expressions(
 ) -> Result<()> {
     if let Expr::GroupingSet(groupings) = expr {
         for e in groupings.distinct_expr() {
-            result.push(Expr::Column(e.to_field(schema)?.qualified_column()))
+            let (qualifier, field) = e.to_field(schema)?;
+            let col = Column::new(qualifier, field.name());
+            result.push(Expr::Column(col))
         }
     } else {
-        result.push(Expr::Column(expr.to_field(schema)?.qualified_column()));
+        let (qualifier, field) = expr.to_field(schema)?;
+        let col = Column::new(qualifier, field.name());
+        result.push(Expr::Column(col));
     }
 
     Ok(())
@@ -1037,8 +1042,8 @@ mod test {
             build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap();
 
         let mut field_set = BTreeSet::new();
-        for field in project_2.schema().fields() {
-            assert!(field_set.insert(field.qualified_name()));
+        for name in project_2.schema().field_names() {
+            assert!(field_set.insert(name));
         }
     }
@@ -1104,8 +1109,8 @@ mod test {
            build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap();
 
         let mut field_set = BTreeSet::new();
-        for field in project_2.schema().fields() {
-            assert!(field_set.insert(field.qualified_name()));
+        for name in project_2.schema().field_names() {
+            assert!(field_set.insert(name));
         }
     }
@@ -1181,12 +1186,13 @@ mod test {
     fn test_extract_expressions_from_grouping_set() -> Result<()> {
         let mut result = Vec::with_capacity(3);
         let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]);
-        let schema = DFSchema::new_with_metadata(
+        let schema = DFSchema::from_unqualifed_fields(
             vec![
-                DFField::new_unqualified("a", DataType::Int32, false),
-                DFField::new_unqualified("b", DataType::Int32, false),
-                DFField::new_unqualified("c", DataType::Int32, false),
-            ],
+                Field::new("a", DataType::Int32, false),
+                Field::new("b", DataType::Int32, false),
+                Field::new("c", DataType::Int32, false),
+            ]
+            .into(),
             HashMap::default(),
         )?;
         extract_expressions(&grouping, &schema, &mut result)?;
@@ -1199,11 +1205,12 @@ mod test {
     fn test_extract_expressions_from_grouping_set_with_identical_expr() -> Result<()> {
         let mut result = Vec::with_capacity(2);
         let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]);
-        let schema = DFSchema::new_with_metadata(
+        let schema = DFSchema::from_unqualifed_fields(
             vec![
-                DFField::new_unqualified("a", DataType::Int32, false),
-                DFField::new_unqualified("b", DataType::Int32, false),
-            ],
+                Field::new("a", DataType::Int32, false),
+                Field::new("b", DataType::Int32, false),
+            ]
+            .into(),
             HashMap::default(),
         )?;
         extract_expressions(&grouping, &schema, &mut result)?;
@@ -1215,8 +1222,8 @@ mod test {
     #[test]
     fn test_extract_expressions_from_col() -> Result<()> {
         let mut result = Vec::with_capacity(1);
-        let schema = DFSchema::new_with_metadata(
-            vec![DFField::new_unqualified("a", DataType::Int32, false)],
+        let schema = DFSchema::from_unqualifed_fields(
+            vec![Field::new("a", DataType::Int32, false)].into(),
             HashMap::default(),
         )?;
         extract_expressions(&col("a"), &schema, &mut result)?;
diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs
index b942f187c331..c40a9bb704eb 100644
--- a/datafusion/optimizer/src/optimize_projections.rs
+++ b/datafusion/optimizer/src/optimize_projections.rs
@@ -665,10 +665,9 @@ fn outer_columns_helper_multi<'a>(
 ///
 /// A vector of `Expr::Column` expressions residing at `indices` of the `input_schema`.
 fn get_required_exprs(input_schema: &Arc<DFSchema>, indices: &[usize]) -> Vec<Expr> {
-    let fields = input_schema.fields();
     indices
         .iter()
-        .map(|&idx| Expr::Column(fields[idx].qualified_column()))
+        .map(|&idx| Expr::Column(Column::from(input_schema.qualified_field(idx))))
         .collect()
 }
diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs
index fe63766fc265..3153f72d7ee7 100644
--- a/datafusion/optimizer/src/optimizer.rs
+++ b/datafusion/optimizer/src/optimizer.rs
@@ -467,7 +467,7 @@ mod tests {
     use crate::test::test_table_scan;
     use crate::{OptimizerConfig, OptimizerContext, OptimizerRule};
 
-    use datafusion_common::{plan_err, DFField, DFSchema, DFSchemaRef, Result};
+    use datafusion_common::{plan_err, DFSchema, DFSchemaRef, Result};
     use datafusion_expr::logical_plan::EmptyRelation;
     use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection};
@@ -509,14 +509,18 @@ mod tests {
         let err = opt.optimize(&plan, &config, &observe).unwrap_err();
         assert_eq!(
             "Optimizer rule 'get table_scan rule' failed\ncaused by\nget table_scan rule\ncaused by\n\
-             Internal error: Failed due to a difference in schemas, \
-             original schema: DFSchema { fields: [\
-             DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \
-             DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \
-             DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \
-             metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \
-             new schema: DFSchema { fields: [], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }.\
-             \nThis was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker",
+             Internal error: Failed due to a difference in schemas, original schema: \
+             DFSchema { inner: Schema { fields: \
+             [Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
+             Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \
+             Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, \
+             field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \
+             functional_dependencies: FunctionalDependencies { deps: [] } }, \
+             new schema: DFSchema { inner: Schema { \
+             fields: [], metadata: {} }, \
+             field_qualifiers: [], \
+             functional_dependencies: FunctionalDependencies { deps: [] } }.\n\
+             This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker",
             err.strip_backtrace()
         );
     }
@@ -610,19 +614,14 @@ mod tests {
 
     fn add_metadata_to_fields(schema: &DFSchema) -> DFSchemaRef {
         let new_fields = schema
-            .fields()
             .iter()
             .enumerate()
-            .map(|(i, f)| {
+            .map(|(i, (qualifier, field))| {
                 let metadata =
                     [("key".into(), format!("value {i}"))].into_iter().collect();
 
-                let new_arrow_field = f.field().as_ref().clone().with_metadata(metadata);
-                if let Some(qualifier) = f.qualifier() {
-                    DFField::from_qualified(qualifier.clone(), new_arrow_field)
-                } else {
-                    DFField::from(new_arrow_field)
-                }
+                let new_arrow_field = field.as_ref().clone().with_metadata(metadata);
+                (qualifier.cloned(), Arc::new(new_arrow_field))
             })
             .collect::<Vec<_>>();
diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs
index d1f9f87a32a3..55fb982d2a87 100644
--- a/datafusion/optimizer/src/propagate_empty_relation.rs
+++ b/datafusion/optimizer/src/propagate_empty_relation.rs
@@ -188,7 +188,7 @@ mod tests {
         test_table_scan_fields, test_table_scan_with_name,
     };
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::{Column, DFField, DFSchema, ScalarValue};
+    use datafusion_common::{Column, DFSchema, ScalarValue};
     use datafusion_expr::logical_plan::table_scan;
     use datafusion_expr::{
         binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, Expr, JoinType,
@@ -373,14 +373,14 @@ mod tests {
     fn test_empty_with_non_empty() -> Result<()> {
         let table_scan = test_table_scan()?;
 
-        let fields = test_table_scan_fields()
-            .into_iter()
-            .map(DFField::from)
-            .collect();
+        let fields = test_table_scan_fields();
 
         let empty = LogicalPlan::EmptyRelation(EmptyRelation {
             produce_one_row: false,
-            schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())?),
+            schema: Arc::new(DFSchema::from_unqualifed_fields(
+                fields.into(),
+                Default::default(),
+            )?),
         });
 
         let one = LogicalPlanBuilder::from(empty.clone()).build()?;
diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs
index e93e171e0324..83db4b0640a4 100644
--- a/datafusion/optimizer/src/push_down_filter.rs
+++ b/datafusion/optimizer/src/push_down_filter.rs
@@ -26,8 +26,8 @@ use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
 use datafusion_common::{
-    internal_err, plan_datafusion_err, Column, DFSchema, DFSchemaRef, JoinConstraint,
-    Result,
+    internal_err, plan_datafusion_err, qualified_name, Column, DFSchema, DFSchemaRef,
+    JoinConstraint, Result,
 };
 use datafusion_expr::expr::Alias;
 use datafusion_expr::expr_rewriter::replace_col;
@@ -198,13 +198,12 @@ fn on_lr_is_preserved(plan: &LogicalPlan) -> Result<(bool, bool)> {
 // relevant columns are contained on the relevant join side's schema.
 fn can_pushdown_join_predicate(predicate: &Expr, schema: &DFSchema) -> Result<bool> {
     let schema_columns = schema
-        .fields()
         .iter()
-        .flat_map(|f| {
+        .flat_map(|(qualifier, field)| {
             [
-                f.qualified_column(),
+                Column::new(qualifier.cloned(), field.name()),
                 // we need to push down filter using unqualified column as well
-                f.unqualified_column(),
+                Column::new_unqualified(field.name()),
             ]
         })
         .collect::<HashSet<_>>();
@@ -305,13 +304,12 @@ fn extract_or_clauses_for_join<'a>(
     schema: &'a DFSchema,
 ) -> impl Iterator<Item = Expr> + 'a {
     let schema_columns = schema
-        .fields()
         .iter()
-        .flat_map(|f| {
+        .flat_map(|(qualifier, field)| {
             [
-                f.qualified_column(),
+                Column::new(qualifier.cloned(), field.name()),
                 // we need to push down filter using unqualified column as well
-                f.unqualified_column(),
+                Column::new_unqualified(field.name()),
             ]
         })
         .collect::<HashSet<_>>();
@@ -672,17 +670,14 @@ impl OptimizerRule for PushDownFilter {
             }
             LogicalPlan::SubqueryAlias(subquery_alias) => {
                 let mut replace_map = HashMap::new();
-                for (i, field) in
-                    subquery_alias.input.schema().fields().iter().enumerate()
+                for (i, (qualifier, field)) in
+                    subquery_alias.input.schema().iter().enumerate()
                 {
+                    let (sub_qualifier, sub_field) =
+                        subquery_alias.schema.qualified_field(i);
                     replace_map.insert(
-                        subquery_alias
-                            .schema
-                            .fields()
-                            .get(i)
-                            .unwrap()
-                            .qualified_name(),
-                        Expr::Column(field.qualified_column()),
+                        qualified_name(sub_qualifier, sub_field.name()),
+                        Expr::Column(Column::new(qualifier.cloned(), field.name())),
                     );
                 }
                 let new_predicate =
@@ -700,17 +695,16 @@ impl OptimizerRule for PushDownFilter {
                 let (volatile_map, non_volatile_map): (HashMap<_, _>, HashMap<_, _>) =
                     projection
                         .schema
-                        .fields()
                         .iter()
                         .enumerate()
-                        .map(|(i, field)| {
+                        .map(|(i, (qualifier, field))| {
                             // strip alias, as they should not be part of filters
                             let expr = match &projection.expr[i] {
                                 Expr::Alias(Alias { expr, .. }) => expr.as_ref().clone(),
                                 expr => expr.clone(),
                             };
-                            (field.qualified_name(), expr)
+                            (qualified_name(qualifier, field.name()), expr)
                         })
                         .partition(|(_, value)| {
                             is_volatile_expression(value).unwrap_or(true)
@@ -760,10 +754,12 @@ impl OptimizerRule for PushDownFilter {
                 let mut inputs = Vec::with_capacity(union.inputs.len());
                 for input in &union.inputs {
                     let mut replace_map = HashMap::new();
-                    for (i, field) in input.schema().fields().iter().enumerate() {
+                    for (i, (qualifier, field)) in input.schema().iter().enumerate() {
+                        let (union_qualifier, union_field) =
+                            union.schema.qualified_field(i);
                         replace_map.insert(
-                            union.schema.fields().get(i).unwrap().qualified_name(),
-                            Expr::Column(field.qualified_column()),
+                            qualified_name(union_qualifier, union_field.name()),
+                            Expr::Column(Column::new(qualifier.cloned(), field.name())),
                         );
                     }
diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs
index 28b3ff090fe6..ccdcf2f65bc8 100644
--- a/datafusion/optimizer/src/push_down_projection.rs
+++ b/datafusion/optimizer/src/push_down_projection.rs
@@ -29,7 +29,7 @@ mod tests {
     use crate::test::*;
     use crate::OptimizerContext;
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::{Column, DFField, DFSchema, Result};
+    use datafusion_common::{Column, DFSchema, Result};
     use datafusion_expr::builder::table_scan_with_filters;
     use datafusion_expr::expr::{self, Cast};
     use datafusion_expr::logical_plan::{
@@ -225,11 +225,20 @@ mod tests {
        assert_eq!(
            **optimized_join.schema(),
            DFSchema::new_with_metadata(
                vec![
-                    DFField::new(Some("test"), "a", DataType::UInt32, false),
-                    DFField::new(Some("test"), "b", DataType::UInt32, false),
-                    DFField::new(Some("test2"), "c1", DataType::UInt32, true),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("a", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("b", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test2".into()),
+                        Arc::new(Field::new("c1", DataType::UInt32, true))
+                    ),
                ],
-                HashMap::new(),
+                HashMap::new()
            )?,
        );
@@ -268,11 +277,20 @@ mod tests {
        assert_eq!(
            **optimized_join.schema(),
            DFSchema::new_with_metadata(
                vec![
-                    DFField::new(Some("test"), "a", DataType::UInt32, false),
-                    DFField::new(Some("test"), "b", DataType::UInt32, false),
-                    DFField::new(Some("test2"), "c1", DataType::UInt32, true),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("a", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("b", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test2".into()),
+                        Arc::new(Field::new("c1", DataType::UInt32, true))
+                    ),
                ],
-                HashMap::new(),
+                HashMap::new()
            )?,
        );
@@ -309,11 +327,20 @@ mod tests {
        assert_eq!(
            **optimized_join.schema(),
            DFSchema::new_with_metadata(
                vec![
-                    DFField::new(Some("test"), "a", DataType::UInt32, false),
-                    DFField::new(Some("test"), "b", DataType::UInt32, false),
-                    DFField::new(Some("test2"), "a", DataType::UInt32, true),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("a", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test".into()),
+                        Arc::new(Field::new("b", DataType::UInt32, false))
+                    ),
+                    (
+                        Some("test2".into()),
+                        Arc::new(Field::new("a", DataType::UInt32, true))
+                    ),
                ],
-                HashMap::new(),
+                HashMap::new()
            )?,
        );
diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs
index 0666c324d12c..0055e329c29d 100644
--- a/datafusion/optimizer/src/replace_distinct_aggregate.rs
+++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs
@@ -18,7 +18,7 @@ use crate::optimizer::{ApplyOrder, ApplyOrder::BottomUp};
 use crate::{OptimizerConfig, OptimizerRule};
 
-use datafusion_common::Result;
+use datafusion_common::{Column, Result};
 use datafusion_expr::utils::expand_wildcard;
 use datafusion_expr::{
     aggregate_function::AggregateFunction as AggregateFunctionFunc, col,
@@ -122,15 +122,12 @@ impl OptimizerRule for ReplaceDistinctWithAggregate {
                 // expressions, for `DISTINCT ON` we only need to emit the original selection expressions.
                 let project_exprs = plan
                     .schema()
-                    .fields()
                     .iter()
                     .skip(on_expr.len())
-                    .zip(schema.fields().iter())
-                    .map(|(new_field, old_field)| {
-                        Ok(col(new_field.qualified_column()).alias_qualified(
-                            old_field.qualifier().cloned(),
-                            old_field.name(),
-                        ))
+                    .zip(schema.iter())
+                    .map(|((new_qualifier, new_field), (old_qualifier, old_field))| {
+                        Ok(col(Column::from((new_qualifier, new_field.as_ref())))
+                            .alias_qualified(old_qualifier.cloned(), old_field.name()))
                     })
                    .collect::<Result<Vec<_>>>()?;
diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
index 1cbe7decf15b..8b70f76617dd 100644
--- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
+++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
@@ -1729,7 +1729,7 @@ mod tests {
     use crate::test::test_table_scan_with_name;
 
     use arrow::datatypes::{DataType, Field, Schema};
-    use datafusion_common::{assert_contains, DFField, ToDFSchema};
+    use datafusion_common::{assert_contains, ToDFSchema};
     use datafusion_expr::{interval_arithmetic::Interval, *};
     use datafusion_physical_expr::execution_props::ExecutionProps;
@@ -3085,17 +3085,18 @@ mod tests {
 
     fn expr_test_schema() -> DFSchemaRef {
         Arc::new(
-            DFSchema::new_with_metadata(
+            DFSchema::from_unqualifed_fields(
                 vec![
-                    DFField::new_unqualified("c1", DataType::Utf8, true),
-                    DFField::new_unqualified("c2", DataType::Boolean, true),
-                    DFField::new_unqualified("c3", DataType::Int64, true),
-                    DFField::new_unqualified("c4", DataType::UInt32, true),
-                    DFField::new_unqualified("c1_non_null", DataType::Utf8, false),
-                    DFField::new_unqualified("c2_non_null", DataType::Boolean, false),
-                    DFField::new_unqualified("c3_non_null", DataType::Int64, false),
-                    DFField::new_unqualified("c4_non_null", DataType::UInt32, false),
-                ],
+                    Field::new("c1", DataType::Utf8, true),
+                    Field::new("c2", DataType::Boolean, true),
+                    Field::new("c3", DataType::Int64, true),
+                    Field::new("c4", DataType::UInt32, true),
+                    Field::new("c1_non_null", DataType::Utf8, false),
+                    Field::new("c2_non_null", DataType::Boolean, false),
+                    Field::new("c3_non_null", DataType::Int64, false),
+                    Field::new("c4_non_null", DataType::UInt32, false),
+                ]
+                .into(),
                 HashMap::new(),
             )
             .unwrap(),
diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs
index 07a9d84f7d48..5b47abb308d0 100644
--- a/datafusion/optimizer/src/single_distinct_to_groupby.rs
+++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs
@@ -22,7 +22,7 @@ use std::sync::Arc;
 use crate::optimizer::ApplyOrder;
 use crate::{OptimizerConfig, OptimizerRule};
 
-use datafusion_common::{DFSchema, Result};
+use datafusion_common::{qualified_name, DFSchema, Result};
 use datafusion_expr::expr::AggregateFunctionDefinition;
 use datafusion_expr::{
     aggregate_function::AggregateFunction::{Max, Min, Sum},
@@ -118,7 +118,6 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                 ..
             }) => {
                 if is_single_distinct_agg(plan)? && !contains_grouping_set(group_expr) {
-                    let fields = schema.fields();
                     // alias all original group_by exprs
                     let (mut inner_group_exprs, out_group_expr_with_alias): (
                         Vec<Expr>,
@@ -150,9 +149,13 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                             // Second aggregate refers to the `test.a + Int32(1)` expression However, its input do not have `test.a` expression in it.
                             let alias_str = format!("group_alias_{i}");
                             let alias_expr = group_expr.clone().alias(&alias_str);
+                            let (qualifier, field) = schema.qualified_field(i);
                             (
                                 alias_expr,
-                                (col(alias_str), Some(fields[i].qualified_name())),
+                                (
+                                    col(alias_str),
+                                    Some(qualified_name(qualifier, field.name())),
+                                ),
                             )
                         }
                     })
@@ -266,7 +269,8 @@ impl OptimizerRule for SingleDistinctToGroupBy {
                 })
                 .chain(outer_aggr_exprs.iter().enumerate().map(|(idx, expr)| {
                     let idx = idx + group_size;
-                    let name = fields[idx].qualified_name();
+                    let (qualifier, field) = schema.qualified_field(idx);
+                    let name = qualified_name(qualifier, field.name());
                     columnize_expr(expr.clone().alias(name), &outer_aggr_schema)
                 }))
                 .collect();
diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
index 196a35ee9ae8..e4a777e7c71a 100644
--- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
+++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs
@@ -484,7 +484,7 @@ mod tests {
     use arrow::compute::{cast_with_options, CastOptions};
     use arrow::datatypes::{DataType, Field};
     use datafusion_common::tree_node::{TransformedResult, TreeNode};
-    use datafusion_common::{DFField, DFSchema, DFSchemaRef, ScalarValue};
+    use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue};
     use datafusion_expr::{cast, col, in_list, lit, try_cast, Expr};
 
     #[test]
@@ -740,25 +740,18 @@ mod tests {
 
     fn expr_test_schema() -> DFSchemaRef {
         Arc::new(
-            DFSchema::new_with_metadata(
+            DFSchema::from_unqualifed_fields(
                 vec![
-                    DFField::new_unqualified("c1", DataType::Int32, false),
-                    DFField::new_unqualified("c2", DataType::Int64, false),
-                    DFField::new_unqualified("c3", DataType::Decimal128(18, 2), false),
-                    DFField::new_unqualified("c4", DataType::Decimal128(38, 37), false),
-                    DFField::new_unqualified("c5", DataType::Float32, false),
-                    DFField::new_unqualified("c6", DataType::UInt32, false),
-                    DFField::new_unqualified(
-                        "ts_nano_none",
-                        timestamp_nano_none_type(),
-                        false,
-                    ),
-                    DFField::new_unqualified(
-                        "ts_nano_utf",
-                        timestamp_nano_utc_type(),
-                        false,
-                    ),
-                ],
+                    Field::new("c1", DataType::Int32, false),
+                    Field::new("c2", DataType::Int64, false),
+                    Field::new("c3", DataType::Decimal128(18, 2), false),
+                    Field::new("c4", DataType::Decimal128(38, 37), false),
+                    Field::new("c5", DataType::Float32, false),
+                    Field::new("c6", DataType::UInt32, false),
+                    Field::new("ts_nano_none", timestamp_nano_none_type(), false),
+                    Field::new("ts_nano_utf", timestamp_nano_utc_type(), false),
+                ]
+                .into(),
                 HashMap::new(),
             )
            .unwrap(),
diff --git a/datafusion/physical-expr/src/functions.rs b/datafusion/physical-expr/src/functions.rs
index a1e471bdd422..f7be2704ab79 100644
--- a/datafusion/physical-expr/src/functions.rs
+++ b/datafusion/physical-expr/src/functions.rs
@@ -200,15 +200,10 @@ pub fn create_physical_fun(
         BuiltinScalarFunction::Nanvl => {
             Arc::new(|args| make_scalar_function_inner(math_expressions::nanvl)(args))
         }
-        BuiltinScalarFunction::Radians => Arc::new(math_expressions::to_radians),
         BuiltinScalarFunction::Random => Arc::new(math_expressions::random),
         BuiltinScalarFunction::Round => {
             Arc::new(|args| make_scalar_function_inner(math_expressions::round)(args))
         }
-        BuiltinScalarFunction::Signum => Arc::new(math_expressions::signum),
-        BuiltinScalarFunction::Sin => Arc::new(math_expressions::sin),
-        BuiltinScalarFunction::Sinh => Arc::new(math_expressions::sinh),
-        BuiltinScalarFunction::Sqrt => Arc::new(math_expressions::sqrt),
         BuiltinScalarFunction::Cbrt => Arc::new(math_expressions::cbrt),
         BuiltinScalarFunction::Trunc => {
             Arc::new(|args| make_scalar_function_inner(math_expressions::trunc)(args))
diff --git a/datafusion/physical-expr/src/math_expressions.rs b/datafusion/physical-expr/src/math_expressions.rs
index 5339c12f6e93..acccb9cb3cd3 100644
--- a/datafusion/physical-expr/src/math_expressions.rs
+++ b/datafusion/physical-expr/src/math_expressions.rs
@@ -155,11 +155,8 @@ macro_rules! make_function_scalar_inputs_return_type {
     }};
 }
 
-math_unary_function!("sqrt", sqrt);
 math_unary_function!("cbrt", cbrt);
-math_unary_function!("sin", sin);
 math_unary_function!("cos", cos);
-math_unary_function!("sinh", sinh);
 math_unary_function!("cosh", cosh);
 math_unary_function!("asin", asin);
 math_unary_function!("acos", acos);
@@ -169,13 +166,11 @@ math_unary_function!("acosh", acosh);
 math_unary_function!("atanh", atanh);
 math_unary_function!("floor", floor);
 math_unary_function!("ceil", ceil);
-math_unary_function!("signum", signum);
 math_unary_function!("exp", exp);
 math_unary_function!("ln", ln);
 math_unary_function!("log2", log2);
 math_unary_function!("log10", log10);
 math_unary_function!("degrees", to_degrees);
-math_unary_function!("radians", to_radians);
 
 /// Factorial SQL function
 pub fn factorial(args: &[ArrayRef]) -> Result<ArrayRef> {
diff --git a/datafusion/physical-plan/src/metrics/builder.rs b/datafusion/physical-plan/src/metrics/builder.rs
index 5e8ff72df35c..2037ddb70c2d 100644
--- a/datafusion/physical-plan/src/metrics/builder.rs
+++ b/datafusion/physical-plan/src/metrics/builder.rs
@@ -123,6 +123,15 @@ impl<'a> MetricBuilder<'a> {
         count
     }
 
+    /// Consume self and create a new counter for recording the total spilled rows
+    /// triggered by an operator
+    pub fn spilled_rows(self, partition: usize) -> Count {
+        let count = Count::new();
+        self.with_partition(partition)
+            .build(MetricValue::SpilledRows(count.clone()));
+        count
+    }
+
     /// Consume self and create a new gauge for reporting current memory usage
     pub fn mem_used(self, partition: usize) -> Gauge {
         let gauge = Gauge::new();
diff --git a/datafusion/physical-plan/src/metrics/mod.rs b/datafusion/physical-plan/src/metrics/mod.rs
index b2e0086f69e9..9232865aa09c 100644
--- a/datafusion/physical-plan/src/metrics/mod.rs
+++ b/datafusion/physical-plan/src/metrics/mod.rs
@@ -70,7 +70,7 @@ pub struct Metric {
     /// The value of the metric
     value: MetricValue,
 
-    /// arbitrary name=value pairs identifiying this metric
+    /// arbitrary name=value pairs identifying this metric
     labels: Vec