Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-40592: [C++][Parquet] Implement SizeStatistics #40594

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,7 @@ set(PARQUET_SRCS
printer.cc
properties.cc
schema.cc
size_statistics.cc
statistics.cc
stream_reader.cc
stream_writer.cc
Expand Down Expand Up @@ -377,6 +378,7 @@ add_parquet_test(internals-test
metadata_test.cc
page_index_test.cc
public_api_test.cc
size_statistics_test.cc
types_test.cc)

set_source_files_properties(public_api_test.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON
Expand Down
25 changes: 18 additions & 7 deletions cpp/src/parquet/column_page.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <optional>
#include <string>

#include "parquet/size_statistics.h"
#include "parquet/statistics.h"
#include "parquet/types.h"

Expand Down Expand Up @@ -69,27 +70,33 @@ class DataPage : public Page {
/// Currently it is only present from data pages created by ColumnWriter in order
/// to collect page index.
std::optional<int64_t> first_row_index() const { return first_row_index_; }
const std::shared_ptr<SizeStatistics>& size_statistics() const {
return size_statistics_;
}

virtual ~DataPage() = default;

protected:
DataPage(PageType::type type, const std::shared_ptr<Buffer>& buffer, int32_t num_values,
Encoding::type encoding, int64_t uncompressed_size,
EncodedStatistics statistics = EncodedStatistics(),
std::optional<int64_t> first_row_index = std::nullopt)
EncodedStatistics statistics, std::optional<int64_t> first_row_index,
std::shared_ptr<SizeStatistics> size_statistics)
: Page(buffer, type),
num_values_(num_values),
encoding_(encoding),
uncompressed_size_(uncompressed_size),
statistics_(std::move(statistics)),
first_row_index_(std::move(first_row_index)) {}
first_row_index_(std::move(first_row_index)),
size_statistics_(std::move(size_statistics)) {}

int32_t num_values_;
Encoding::type encoding_;
int64_t uncompressed_size_;
EncodedStatistics statistics_;
/// Row ordinal within the row group to the first row in the data page.
std::optional<int64_t> first_row_index_;
/// Size statistics for the data page. It may be null if unavailable.
std::shared_ptr<SizeStatistics> size_statistics_;
wgtmac marked this conversation as resolved.
Show resolved Hide resolved
};

class DataPageV1 : public DataPage {
Expand All @@ -98,9 +105,11 @@ class DataPageV1 : public DataPage {
Encoding::type encoding, Encoding::type definition_level_encoding,
Encoding::type repetition_level_encoding, int64_t uncompressed_size,
EncodedStatistics statistics = EncodedStatistics(),
std::optional<int64_t> first_row_index = std::nullopt)
std::optional<int64_t> first_row_index = std::nullopt,
std::shared_ptr<SizeStatistics> size_statistics = NULLPTR)
: DataPage(PageType::DATA_PAGE, buffer, num_values, encoding, uncompressed_size,
std::move(statistics), std::move(first_row_index)),
std::move(statistics), std::move(first_row_index),
std::move(size_statistics)),
definition_level_encoding_(definition_level_encoding),
repetition_level_encoding_(repetition_level_encoding) {}

Expand All @@ -120,9 +129,11 @@ class DataPageV2 : public DataPage {
int32_t definition_levels_byte_length, int32_t repetition_levels_byte_length,
int64_t uncompressed_size, bool is_compressed = false,
EncodedStatistics statistics = EncodedStatistics(),
std::optional<int64_t> first_row_index = std::nullopt)
std::optional<int64_t> first_row_index = std::nullopt,
std::shared_ptr<SizeStatistics> size_statistics = NULLPTR)
: DataPage(PageType::DATA_PAGE_V2, buffer, num_values, encoding, uncompressed_size,
std::move(statistics), std::move(first_row_index)),
std::move(statistics), std::move(first_row_index),
std::move(size_statistics)),
num_nulls_(num_nulls),
num_rows_(num_rows),
definition_levels_byte_length_(definition_levels_byte_length),
Expand Down
Loading
Loading