Skip to content

Commit

Permalink
Add ParquetMetadataWriter allow ad-hoc encoding of ParquetMetadata
Browse files Browse the repository at this point in the history
  • Loading branch information
adriangb committed Jul 24, 2024
1 parent 3bc9987 commit b38ccf7
Show file tree
Hide file tree
Showing 5 changed files with 500 additions and 103 deletions.
8 changes: 6 additions & 2 deletions parquet/src/arrow/async_reader/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ use crate::errors::{ParquetError, Result};
use crate::file::footer::{decode_footer, decode_metadata};
use crate::file::metadata::ParquetMetaData;
use crate::file::page_index::index::Index;
use crate::file::page_index::index_reader::{acc_range, decode_column_index, decode_offset_index};
use crate::file::page_index::index_reader::{
acc_range, decode_column_index, decode_page_locations,
};
use bytes::Bytes;
use futures::future::BoxFuture;
use futures::FutureExt;
Expand Down Expand Up @@ -177,7 +179,9 @@ impl<F: MetadataFetch> MetadataLoader<F> {
x.columns()
.iter()
.map(|c| match c.offset_index_range() {
Some(r) => decode_offset_index(&data[r.start - offset..r.end - offset]),
Some(r) => {
decode_page_locations(&data[r.start - offset..r.end - offset])
}
None => Err(general_err!("missing offset index")),
})
.collect::<Result<Vec<_>>>()
Expand Down
4 changes: 2 additions & 2 deletions parquet/src/file/metadata/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
/// [`parse_metadata`]: crate::file::footer::parse_metadata
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
/// File level metadata
file_metadata: FileMetaData,
Expand Down Expand Up @@ -222,7 +222,7 @@ pub type FileMetaDataPtr = Arc<FileMetaData>;
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
version: i32,
num_rows: i64,
Expand Down
32 changes: 32 additions & 0 deletions parquet/src/file/page_index/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,38 @@ impl<T: ParquetValueType> NativeIndex<T> {
boundary_order: index.boundary_order,
})
}

pub(crate) fn to_thrift(&self) -> ColumnIndex {
let min_values = self
.indexes
.iter()
.map(|x| x.min_bytes().map(|x| x.to_vec()))
.collect::<Option<Vec<_>>>()
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);

let max_values = self
.indexes
.iter()
.map(|x| x.max_bytes().map(|x| x.to_vec()))
.collect::<Option<Vec<_>>>()
.unwrap_or_else(|| vec![vec![]; self.indexes.len()]);

let null_counts = self
.indexes
.iter()
.map(|x| x.null_count())
.collect::<Option<Vec<_>>>();

ColumnIndex::new(
self.indexes.iter().map(|x| x.min().is_none()).collect(),
min_values,
max_values,
self.boundary_order,
null_counts,
None,
None,
)
}
}

#[cfg(test)]
Expand Down
Loading

0 comments on commit b38ccf7

Please sign in to comment.