From d48010e5717a1fb54822ad3a8c41f084cff05c92 Mon Sep 17 00:00:00 2001 From: Ed Seidl Date: Wed, 25 Sep 2024 16:55:21 -0700 Subject: [PATCH] Workaround for missing Parquet page indexes in `ParquetMetadaReader` (#6450) * workaround for missing page indexes * remove empty line * Apply suggestions from code review Co-authored-by: Andrew Lamb * fmt --------- Co-authored-by: Andrew Lamb --- parquet/src/arrow/arrow_reader/mod.rs | 26 ++++++------------------- parquet/src/file/metadata/reader.rs | 28 +++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 253625117c2b..a109851f72b6 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -34,9 +34,7 @@ use crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField}; use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask}; use crate::column::page::{PageIterator, PageReader}; use crate::errors::{ParquetError, Result}; -use crate::file::footer; -use crate::file::metadata::ParquetMetaData; -use crate::file::page_index::index_reader; +use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader}; use crate::file::reader::{ChunkReader, SerializedPageReader}; use crate::schema::types::SchemaDescriptor; @@ -382,23 +380,9 @@ impl ArrowReaderMetadata { /// `Self::metadata` is missing the page index, this function will attempt /// to load the page index by making an object store request. pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { - let mut metadata = footer::parse_metadata(reader)?; - if options.page_index { - let column_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_columns_indexes(reader, rg.columns())) - .collect::>>()?; - metadata.set_column_index(Some(column_index)); - - let offset_index = metadata - .row_groups() - .iter() - .map(|rg| index_reader::read_offset_indexes(reader, rg.columns())) - .collect::>>()?; - - metadata.set_offset_index(Some(offset_index)) - } + let metadata = ParquetMetaDataReader::new() + .with_page_indexes(options.page_index) + .parse_and_finish(reader)?; Self::try_new(Arc::new(metadata), options) } @@ -3498,6 +3482,8 @@ mod tests { .unwrap(); // Although `Vec>` of each row group is empty, // we should read the file successfully. + // FIXME: this test will fail when metadata parsing returns `None` for missing page + // indexes. https://github.com/apache/arrow-rs/issues/6447 assert!(builder.metadata().offset_index().unwrap()[0].is_empty()); let reader = builder.build().unwrap(); let batches = reader.collect::, _>>().unwrap(); diff --git a/parquet/src/file/metadata/reader.rs b/parquet/src/file/metadata/reader.rs index 333e0fd6e90d..4a5a1bc93c4f 100644 --- a/parquet/src/file/metadata/reader.rs +++ b/parquet/src/file/metadata/reader.rs @@ -201,6 +201,7 @@ impl ParquetMetaDataReader { // need for more data. This is not it's intended use. The plan is to add a NeedMoreData // value to the enum, but this would be a breaking change. This will be done as // 54.0.0 draws nearer. + // https://github.com/apache/arrow-rs/issues/6447 Err(ParquetError::IndexOutOfBound(needed, _)) => { // If reader is the same length as `file_size` then presumably there is no more to // read, so return an EOF error. @@ -247,13 +248,18 @@ impl ParquetMetaDataReader { )); } - // TODO(ets): what is the correct behavior for missing page indexes? MetadataLoader would - // leave them as `None`, while the parser in `index_reader::read_columns_indexes` returns a - // vector of empty vectors. - // I think it's best to leave them as `None`. + // FIXME: there are differing implementations in the case where page indexes are missing + // from the file. `MetadataLoader` will leave them as `None`, while the parser in + // `index_reader::read_columns_indexes` returns a vector of empty vectors. + // It is best for this function to replicate the latter behavior for now, but in a future + // breaking release, the two paths to retrieve metadata should be made consistent. Note that this is only + // an issue if the user requested page indexes, so there is no need to provide empty + // vectors in `try_parse_sized()`. + // https://github.com/apache/arrow-rs/issues/6447 // Get bounds needed for page indexes (if any are present in the file). let Some(range) = self.range_for_page_index() else { + self.empty_page_indexes(); return Ok(()); }; @@ -412,6 +418,20 @@ impl ParquetMetaDataReader { Ok(()) } + /// Set the column_index and offset_indexes to empty `Vec` for backwards compatibility + /// + /// See for details + fn empty_page_indexes(&mut self) { + let metadata = self.metadata.as_mut().unwrap(); + let num_row_groups = metadata.num_row_groups(); + if self.column_index { + metadata.set_column_index(Some(vec![vec![]; num_row_groups])); + } + if self.offset_index { + metadata.set_offset_index(Some(vec![vec![]; num_row_groups])); + } + } + fn range_for_page_index(&self) -> Option> { // sanity check self.metadata.as_ref()?;