Skip to content

Commit

Permalink
rename builder functions
Browse files (browse the repository at this point in the history)
  • Loading branch information
wgtmac committed Jun 23, 2024
1 parent 0d19b84 commit 82f4b34
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 49 deletions.
18 changes: 9 additions & 9 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1595,15 +1595,15 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
}

if (descr_->max_definition_level() > 0) {
page_size_stats_builder_->WriteDefinitionLevels(num_levels, def_levels);
page_size_stats_builder_->AddDefinitionLevels(num_levels, def_levels);
} else {
page_size_stats_builder_->WriteDefinitionLevel(num_levels, /*def_level=*/0);
page_size_stats_builder_->AddDefinitionLevel(num_levels, /*def_level=*/0);
}

if (descr_->max_repetition_level() > 0) {
page_size_stats_builder_->WriteRepetitionLevels(num_levels, rep_levels);
page_size_stats_builder_->AddRepetitionLevels(num_levels, rep_levels);
} else {
page_size_stats_builder_->WriteRepetitionLevel(num_levels, /*rep_level=*/0);
page_size_stats_builder_->AddRepetitionLevel(num_levels, /*rep_level=*/0);
}
}

Expand Down Expand Up @@ -1660,7 +1660,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
}
if constexpr (std::is_same_v<T, ByteArray>) {
if (page_size_stats_builder_ != nullptr) {
page_size_stats_builder_->WriteValues(values, num_values);
page_size_stats_builder_->AddValues(values, num_values);
}
}
}
Expand Down Expand Up @@ -1693,8 +1693,8 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
}
if constexpr (std::is_same_v<T, ByteArray>) {
if (page_size_stats_builder_ != nullptr) {
page_size_stats_builder_->WriteValuesSpaced(values, valid_bits, valid_bits_offset,
num_spaced_values);
page_size_stats_builder_->AddValuesSpaced(values, valid_bits, valid_bits_offset,
num_spaced_values);
}
}
}
Expand Down Expand Up @@ -1783,7 +1783,7 @@ Status TypedColumnWriterImpl<DType>::WriteArrowDictionary(
page_statistics_->Update(*referenced_dictionary, /*update_counts=*/false);
}
if (page_size_stats_builder_) {
page_size_stats_builder_->WriteValues(*referenced_dictionary);
page_size_stats_builder_->AddValues(*referenced_dictionary);
}
};

Expand Down Expand Up @@ -2289,7 +2289,7 @@ Status TypedColumnWriterImpl<ByteArrayType>::WriteArrowDense(
page_statistics_->IncrementNumValues(non_null);
}
if (page_size_stats_builder_ != nullptr) {
page_size_stats_builder_->WriteValues(*data_slice);
page_size_stats_builder_->AddValues(*data_slice);
}
CommitWriteAndCheckPageLimit(batch_size, batch_num_values, batch_size - non_null,
check_page);
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/parquet/page_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -697,8 +697,11 @@ class OffsetIndexBuilderImpl final : public OffsetIndexBuilder {
offset_index_.unencoded_byte_array_data_bytes.size()) {
offset_index_.__isset.unencoded_byte_array_data_bytes = true;
} else {
/// Discard unencoded_byte_array_data_bytes if its size is abnormal.
offset_index_.unencoded_byte_array_data_bytes.clear();
std::stringstream ss;
ss << "Invalid count of unencoded BYTE_ARRAY data bytes: "
<< offset_index_.unencoded_byte_array_data_bytes.size()
<< ", expected page count: " << offset_index_.page_locations.size();
throw ParquetException(ss.str());
}

state_ = BuilderState::kFinished;
Expand Down
24 changes: 12 additions & 12 deletions cpp/src/parquet/size_statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,36 +216,36 @@ class SizeStatisticsBuilder::SizeStatisticsBuilderImpl {
std::optional<int64_t> unencoded_byte_array_data_bytes_;
};

void SizeStatisticsBuilder::WriteRepetitionLevels(int64_t num_levels,
const int16_t* rep_levels) {
void SizeStatisticsBuilder::AddRepetitionLevels(int64_t num_levels,
const int16_t* rep_levels) {
impl_->WriteRepetitionLevels(num_levels, rep_levels);
}

void SizeStatisticsBuilder::WriteDefinitionLevels(int64_t num_levels,
const int16_t* def_levels) {
void SizeStatisticsBuilder::AddDefinitionLevels(int64_t num_levels,
const int16_t* def_levels) {
impl_->WriteDefinitionLevels(num_levels, def_levels);
}

void SizeStatisticsBuilder::WriteRepetitionLevel(int64_t num_levels, int16_t rep_level) {
void SizeStatisticsBuilder::AddRepetitionLevel(int64_t num_levels, int16_t rep_level) {
impl_->WriteRepetitionLevel(num_levels, rep_level);
}

void SizeStatisticsBuilder::WriteDefinitionLevel(int64_t num_levels, int16_t def_level) {
void SizeStatisticsBuilder::AddDefinitionLevel(int64_t num_levels, int16_t def_level) {
impl_->WriteDefinitionLevel(num_levels, def_level);
}

void SizeStatisticsBuilder::WriteValuesSpaced(const ByteArray* values,
const uint8_t* valid_bits,
int64_t valid_bits_offset,
int64_t num_spaced_values) {
void SizeStatisticsBuilder::AddValuesSpaced(const ByteArray* values,
const uint8_t* valid_bits,
int64_t valid_bits_offset,
int64_t num_spaced_values) {
impl_->WriteValuesSpaced(values, valid_bits, valid_bits_offset, num_spaced_values);
}

void SizeStatisticsBuilder::WriteValues(const ByteArray* values, int64_t num_values) {
void SizeStatisticsBuilder::AddValues(const ByteArray* values, int64_t num_values) {
impl_->WriteValues(values, num_values);
}

void SizeStatisticsBuilder::WriteValues(const ::arrow::Array& values) {
void SizeStatisticsBuilder::AddValues(const ::arrow::Array& values) {
impl_->WriteValues(values);
}

Expand Down
22 changes: 13 additions & 9 deletions cpp/src/parquet/size_statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ class ColumnDescriptor;
class PARQUET_EXPORT SizeStatistics {
public:
/// \brief API convenience to get a SizeStatistics accessor
///
/// \param size_statistics pointer to the thrift SizeStatistics structure.
/// \param descr column descriptor for the column.
/// \returns SizeStatistics object. Its lifetime is not bound to the input.
static std::unique_ptr<SizeStatistics> Make(const void* size_statistics,
const ColumnDescriptor* descr);

Expand Down Expand Up @@ -107,38 +111,38 @@ class PARQUET_EXPORT SizeStatisticsBuilder {
/// \brief Add repetition levels to the histogram.
/// \param num_levels number of repetition levels to add.
/// \param rep_levels repetition levels to add.
void WriteRepetitionLevels(int64_t num_levels, const int16_t* rep_levels);
void AddRepetitionLevels(int64_t num_levels, const int16_t* rep_levels);

/// \brief Add definition levels to the histogram.
/// \param num_levels number of definition levels to add.
/// \param def_levels definition levels to add.
void WriteDefinitionLevels(int64_t num_levels, const int16_t* def_levels);
void AddDefinitionLevels(int64_t num_levels, const int16_t* def_levels);

/// \brief Add repeated repetition level to the histogram.
/// \param num_levels number of repetition levels to add.
/// \param rep_level repeated repetition level value.
void WriteRepetitionLevel(int64_t num_levels, int16_t rep_level);
void AddRepetitionLevel(int64_t num_levels, int16_t rep_level);

/// \brief Add repeated definition level to the histogram.
/// \param num_levels number of definition levels to add.
/// \param def_level repeated definition level value.
void WriteDefinitionLevel(int64_t num_levels, int16_t def_level);
void AddDefinitionLevel(int64_t num_levels, int16_t def_level);

/// \brief Add spaced BYTE_ARRAY values.
/// \param[in] values pointer to values of BYTE_ARRAY type.
/// \param[in] valid_bits pointer to bitmap representing if values are non-null.
/// \param[in] valid_bits_offset offset into valid_bits where the slice of data begins.
/// \param[in] num_spaced_values length of values in values/valid_bits to inspect.
void WriteValuesSpaced(const ByteArray* values, const uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t num_spaced_values);
void AddValuesSpaced(const ByteArray* values, const uint8_t* valid_bits,
int64_t valid_bits_offset, int64_t num_spaced_values);

/// \brief Add dense BYTE_ARRAY values.
/// \param values pointer to values of BYTE_ARRAY type.
/// \param num_values length of values.
void WriteValues(const ByteArray* values, int64_t num_values);
void AddValues(const ByteArray* values, int64_t num_values);

/// \brief Add BYTE_ARRAY values in the arrow array.
void WriteValues(const ::arrow::Array& values);
void AddValues(const ::arrow::Array& values);

/// \brief Build a SizeStatistics from collected data.
std::unique_ptr<SizeStatistics> Build();
Expand All @@ -148,7 +152,7 @@ class PARQUET_EXPORT SizeStatisticsBuilder {

private:
// PIMPL Idiom
SizeStatisticsBuilder(const ColumnDescriptor* descr);
explicit SizeStatisticsBuilder(const ColumnDescriptor* descr);
class SizeStatisticsBuilderImpl;
std::unique_ptr<SizeStatisticsBuilderImpl> impl_;
};
Expand Down
34 changes: 17 additions & 17 deletions cpp/src/parquet/size_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@ TEST(SizeStatistics, WriteBatchLevels) {
};

write_batch_levels(expected_def_level_histogram,
&SizeStatisticsBuilder::WriteDefinitionLevels);
&SizeStatisticsBuilder::AddDefinitionLevels);
write_batch_levels(expected_rep_level_histogram,
&SizeStatisticsBuilder::WriteRepetitionLevels);
&SizeStatisticsBuilder::AddRepetitionLevels);
auto size_statistics = builder->Build();
EXPECT_EQ(size_statistics->definition_level_histogram(), expected_def_level_histogram);
EXPECT_EQ(size_statistics->repetition_level_histogram(), expected_rep_level_histogram);
Expand All @@ -79,11 +79,11 @@ TEST(SizeStatistics, WriteRepeatedLevels) {
constexpr int64_t kNumRounds = 10;
for (int64_t round = 1; round <= kNumRounds; round++) {
for (int16_t def_level = 0; def_level <= kMaxDefLevel; def_level++) {
builder->WriteDefinitionLevel(/*num_levels=*/round + def_level, def_level);
builder->AddDefinitionLevel(/*num_levels=*/round + def_level, def_level);
}
for (int16_t rep_level = 0; rep_level <= kMaxRepLevel; rep_level++) {
builder->WriteRepetitionLevel(/*num_levels=*/round + rep_level * rep_level,
rep_level);
builder->AddRepetitionLevel(/*num_levels=*/round + rep_level * rep_level,
rep_level);
}
}

Expand All @@ -105,7 +105,7 @@ TEST(SizeStatistics, WriteDenseByteArrayValues) {
auto builder = SizeStatisticsBuilder::Make(descr.get());
for (int i = 0; i < kNumValues; i += kBatchSize) {
auto batch_size = std::min(kBatchSize, kNumValues - i);
builder->WriteValues(values.data() + i, batch_size);
builder->AddValues(values.data() + i, batch_size);
}

auto size_statistics = builder->Build();
Expand Down Expand Up @@ -134,7 +134,7 @@ TEST(SizeStatistics, WriteSpacedByteArrayValues) {
auto builder = SizeStatisticsBuilder::Make(descr.get());
for (int i = 0; i < kNumValues; i += kBatchSize) {
auto batch_size = std::min(kBatchSize, kNumValues - i);
builder->WriteValuesSpaced(values.data() + i, not_null_bitmap->data(), i, batch_size);
builder->AddValuesSpaced(values.data() + i, not_null_bitmap->data(), i, batch_size);
}

auto size_statistics = builder->Build();
Expand All @@ -151,7 +151,7 @@ TEST(SizeStatistics, WriteBinaryArray) {
auto descr = std::make_unique<ColumnDescriptor>(
schema::ByteArray("a"), /*max_def_level=*/1, /*max_rep_level=*/0);
auto builder = SizeStatisticsBuilder::Make(descr.get());
builder->WriteValues(*array);
builder->AddValues(*array);
auto size_statistics = builder->Build();
EXPECT_EQ(size_statistics->unencoded_byte_array_data_bytes().value_or(-1), 9);
}
Expand All @@ -174,18 +174,18 @@ TEST(SizeStatistics, MergeStatistics) {
std::make_unique<ColumnDescriptor>(schema::ByteArray("a"), /*max_def_level=*/3,
/*max_rep_level=*/3)}) {
auto builder = SizeStatisticsBuilder::Make(descr.get());
builder->WriteRepetitionLevels(kNumValues, def_levels.data());
builder->WriteDefinitionLevels(kNumValues, rep_levels.data());
builder->AddRepetitionLevels(kNumValues, def_levels.data());
builder->AddDefinitionLevels(kNumValues, rep_levels.data());
if (descr->physical_type() == Type::BYTE_ARRAY) {
builder->WriteValues(values.data(), kNumValues);
builder->AddValues(values.data(), kNumValues);
}
auto size_statistics_1 = builder->Build();

builder->Reset();
builder->WriteRepetitionLevels(kNumValues, def_levels.data());
builder->WriteDefinitionLevels(kNumValues, rep_levels.data());
builder->AddRepetitionLevels(kNumValues, def_levels.data());
builder->AddDefinitionLevels(kNumValues, rep_levels.data());
if (descr->physical_type() == Type::BYTE_ARRAY) {
builder->WriteValues(values.data(), kNumValues);
builder->AddValues(values.data(), kNumValues);
}
auto size_statistics_2 = builder->Build();

Expand Down Expand Up @@ -219,10 +219,10 @@ TEST(SizeStatistics, ThriftSerDe) {
std::make_unique<ColumnDescriptor>(schema::ByteArray("a"), /*max_def_level=*/3,
/*max_rep_level=*/3)}) {
auto builder = SizeStatisticsBuilder::Make(descr.get());
builder->WriteRepetitionLevels(kNumValues, def_levels.data());
builder->WriteDefinitionLevels(kNumValues, rep_levels.data());
builder->AddRepetitionLevels(kNumValues, def_levels.data());
builder->AddDefinitionLevels(kNumValues, rep_levels.data());
if (descr->physical_type() == Type::BYTE_ARRAY) {
builder->WriteValues(values.data(), kNumValues);
builder->AddValues(values.data(), kNumValues);
}
auto size_statistics = builder->Build();
auto thrift_statistics = ToThrift(*size_statistics);
Expand Down

0 comments on commit 82f4b34

Please sign in to comment.