Skip to content

Commit

Permalink
Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts (#5362)
Browse files Browse the repository at this point in the history

* Return an error instead of a panic when reading a corrupted Parquet file with mismatched column counts

* Update parquet/src/file/metadata.rs

Co-authored-by: Jeffrey Vo <[email protected]>

* Fix test

---------

Co-authored-by: Matthieu Maitre <[email protected]>
Co-authored-by: Jeffrey Vo <[email protected]>
  • Loading branch information
3 people authored Feb 4, 2024
1 parent 79721ec commit 0dda129
Showing 1 changed file with 76 additions and 1 deletion.
77 changes: 76 additions & 1 deletion parquet/src/file/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,13 @@ impl RowGroupMetaData {

/// Method to convert from Thrift.
pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
assert_eq!(schema_descr.num_columns(), rg.columns.len());
if schema_descr.num_columns() != rg.columns.len() {
return Err(general_err!(
"Column count mismatch. Schema has {} columns while Row Group has {}",
schema_descr.num_columns(),
rg.columns.len()
));
}
let total_byte_size = rg.total_byte_size;
let num_rows = rg.num_rows;
let mut columns = vec![];
Expand Down Expand Up @@ -1039,6 +1045,75 @@ mod tests {
}
}

/// Checks that converting a row group with only 2 columns against a schema that declares
/// 3 columns (as in a corrupted Parquet file) produces a column-count-mismatch error
#[test]
fn test_row_group_metadata_thrift_corrupted() {
    // Builds a descriptor for a group named "schema" holding one INT32 field per given name.
    let int32_schema = |names: &[&str]| {
        let fields = names
            .iter()
            .map(|&name| {
                Arc::new(
                    SchemaType::primitive_type_builder(name, Type::INT32)
                        .build()
                        .unwrap(),
                )
            })
            .collect::<Vec<_>>();
        let root = SchemaType::group_type_builder("schema")
            .with_fields(fields)
            .build()
            .unwrap();
        Arc::new(SchemaDescriptor::new(Arc::new(root)))
    };

    let two_col_schema = int32_schema(&["a", "b"]);
    let three_col_schema = int32_schema(&["a", "b", "c"]);

    // One column chunk for each of the two columns in the smaller schema.
    let column_chunks = (0..2)
        .map(|idx| {
            ColumnChunkMetaData::builder(two_col_schema.column(idx))
                .build()
                .unwrap()
        })
        .collect::<Vec<_>>();

    let two_col_row_group = RowGroupMetaData::builder(two_col_schema.clone())
        .set_num_rows(1000)
        .set_total_byte_size(2000)
        .set_column_metadata(column_chunks)
        .set_ordinal(1)
        .build()
        .unwrap();

    // Pair the 2-column row group with the 3-column schema to trigger the mismatch.
    let outcome = RowGroupMetaData::from_thrift(three_col_schema, two_col_row_group.to_thrift());
    let message = outcome.unwrap_err().to_string();
    assert_eq!(
        message,
        "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
    );
}

#[test]
fn test_column_chunk_metadata_thrift_conversion() {
let column_descr = get_test_schema_descr().column(0);
Expand Down

0 comments on commit 0dda129

Please sign in to comment.