diff --git a/velox/common/base/BitSet.h b/velox/common/base/BitSet.h index 765eb1f7073a..bcdb6e0a38ba 100644 --- a/velox/common/base/BitSet.h +++ b/velox/common/base/BitSet.h @@ -22,12 +22,12 @@ #include "velox/common/base/BitUtil.h" namespace facebook::velox { -// Dynamic size dense bit set that Keeps track of maximum set bit. +/// Dynamic size dense bit set that keeps track of maximum set bit. class BitSet { public: - // Constructs a bitSet. 'min' is the lowest possible member of the - // set. Values below this are not present and inserting these is a - // no-op. 'min' is used when using this as an IN predicate filter. + /// Constructs a bitSet. 'min' is the lowest possible member of the set. + /// Values below this are not present and inserting these is a no-op. 'min' is + /// used when using this as an IN predicate filter. explicit BitSet(int64_t min) : min_(min) {} void insert(int64_t index) { @@ -56,7 +56,7 @@ class BitSet { return bits::isBitSet(bits_.data(), bit); } - // Returns the largest element of the set or 'min_ - 1' if empty. + /// Returns the largest element of the set or 'min_ - 1' if empty. int64_t max() const { return lastSetBit_ + min_; } @@ -66,8 +66,8 @@ class BitSet { } private: - std::vector bits_; const int64_t min_; + std::vector bits_; int64_t lastSetBit_ = -1; }; diff --git a/velox/common/base/BitUtil.h b/velox/common/base/BitUtil.h index 7ac847a33db8..75b118ce094f 100644 --- a/velox/common/base/BitUtil.h +++ b/velox/common/base/BitUtil.h @@ -95,6 +95,11 @@ constexpr inline T roundUp(T value, U factor) { return (value + (factor - 1)) / factor * factor; } +template +constexpr inline T divRoundUp(T value, U factor) { + return (value + (factor - 1)) / factor; +} + constexpr inline uint64_t lowMask(int32_t bits) { return (1UL << bits) - 1; } diff --git a/velox/common/base/SelectivityInfo.h b/velox/common/base/SelectivityInfo.h index 825e476321a4..6d52790d637e 100644 --- a/velox/common/base/SelectivityInfo.h +++ b/velox/common/base/SelectivityInfo.h @@ -58,8 +58,8 @@ class SelectivityInfo { class SelectivityTimer { public: SelectivityTimer(SelectivityInfo& info, uint64_t numIn) - : startClocks_(folly::hardware_timestamp()), - totalClocks_(&info.timeClocks_) { + : totalClocks_(&info.timeClocks_), + startClocks_(folly::hardware_timestamp()) { info.numIn_ += numIn; } @@ -72,8 +72,8 @@ class SelectivityTimer { } private: - uint64_t startClocks_; uint64_t* const totalClocks_; + uint64_t startClocks_; }; } // namespace velox diff --git a/velox/common/base/tests/BitUtilTest.cpp b/velox/common/base/tests/BitUtilTest.cpp index f181e412d900..c0c31464d3b5 100644 --- a/velox/common/base/tests/BitUtilTest.cpp +++ b/velox/common/base/tests/BitUtilTest.cpp @@ -870,6 +870,67 @@ TEST_F(BitUtilTest, storeBitsToByte) { ASSERT_EQ(bytes[2], 0); } +TEST_F(BitUtilTest, roundUp) { + struct { + uint64_t value; + uint64_t factor; + uint64_t expected; + + std::string debugString() const { + return fmt::format( + "value: {}, factor: {}, expected: {}", value, factor, expected); + } + } testSettings[] = { + {10, 1, 10}, + {10, 3, 12}, + {10, 4, 12}, + {10, 10, 10}, + {10, 11, 11}, + {10, 20, 20}, + {11, 1, 11}, + {11, 3, 12}, + {11, 4, 12}, + {11, 11, 11}, + {11, 12, 12}, + {11, 23, 23}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + ASSERT_EQ( + bits::roundUp(testData.value, testData.factor), testData.expected); + } +} + +TEST_F(BitUtilTest, divRoundUp) { + struct { + uint64_t value; + uint64_t factor; + uint64_t expected; + + std::string debugString() const { + return fmt::format( + "value: {}, factor: {}, expected: {}", value, factor, expected); + } + } testSettings[] = { + {10, 1, 10}, + {10, 3, 4}, + {10, 4, 3}, + {10, 10, 1}, + {10, 11, 1}, + {10, 20, 1}, + {11, 1, 11}, + {11, 3, 4}, + {11, 4, 3}, + {11, 11, 1}, + {11, 12, 1}, + {11, 23, 1}}; + + for (const auto& testData : testSettings) { + SCOPED_TRACE(testData.debugString()); + ASSERT_EQ( + bits::divRoundUp(testData.value, testData.factor), testData.expected); + } +} } // namespace bits } // namespace velox } // namespace facebook diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index b32ec57eed78..236120e1763f 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -583,14 +583,14 @@ void configureReaderOptions( } void configureRowReaderOptions( - dwio::common::RowReaderOptions& rowReaderOptions, const std::unordered_map& tableParameters, const std::shared_ptr& scanSpec, std::shared_ptr metadataFilter, const RowTypePtr& rowType, const std::shared_ptr& hiveSplit, const std::shared_ptr& hiveConfig, - const config::ConfigBase* sessionProperties) { + const config::ConfigBase* sessionProperties, + dwio::common::RowReaderOptions& rowReaderOptions) { auto skipRowsIt = tableParameters.find(dwio::common::TableParameter::kSkipHeaderLineCount); if (skipRowsIt != tableParameters.end()) { @@ -649,22 +649,22 @@ bool testFilters( const dwio::common::Reader* reader, const std::string& filePath, const std::unordered_map>& - partitionKey, + partitionKeys, const std::unordered_map>& partitionKeysHandle) { - auto totalRows = reader->numberOfRows(); + const auto totalRows = reader->numberOfRows(); const auto& fileTypeWithId = reader->typeWithId(); const auto& rowType = reader->rowType(); for (const auto& child : scanSpec->children()) { if (child->filter()) { const auto& name = child->fieldName(); - auto iter = partitionKey.find(name); + auto iter = partitionKeys.find(name); // By design, the partition key columns for Iceberg tables are included in // the data files to facilitate partition transform and partition // evolution, so we need to test both cases. - if (!rowType->containsChild(name) || iter != partitionKey.end()) { - if (iter != partitionKey.end() && iter->second.has_value()) { - auto handlesIter = partitionKeysHandle.find(name); + if (!rowType->containsChild(name) || iter != partitionKeys.end()) { + if (iter != partitionKeys.end() && iter->second.has_value()) { + const auto handlesIter = partitionKeysHandle.find(name); VELOX_CHECK(handlesIter != partitionKeysHandle.end()); // This is a non-null partition key @@ -684,7 +684,7 @@ bool testFilters( } } else { const auto& typeWithId = fileTypeWithId->childByName(name); - auto columnStats = reader->columnStatistics(typeWithId->id()); + const auto columnStats = reader->columnStatistics(typeWithId->id()); if (columnStats != nullptr && !testFilter( child->filter(), diff --git a/velox/connectors/hive/HiveConnectorUtil.h b/velox/connectors/hive/HiveConnectorUtil.h index 3674ce41483b..3b5f25ad82ce 100644 --- a/velox/connectors/hive/HiveConnectorUtil.h +++ b/velox/connectors/hive/HiveConnectorUtil.h @@ -76,14 +76,14 @@ void configureReaderOptions( const std::unordered_map& tableParameters = {}); void configureRowReaderOptions( - dwio::common::RowReaderOptions& rowReaderOptions, const std::unordered_map& tableParameters, const std::shared_ptr& scanSpec, std::shared_ptr metadataFilter, const RowTypePtr& rowType, const std::shared_ptr& hiveSplit, - const std::shared_ptr& hiveConfig = nullptr, - const config::ConfigBase* sessionProperties = nullptr); + const std::shared_ptr& hiveConfig, + const config::ConfigBase* sessionProperties, + dwio::common::RowReaderOptions& rowReaderOptions); bool testFilters( const common::ScanSpec* scanSpec, diff --git a/velox/connectors/hive/HiveDataSource.cpp b/velox/connectors/hive/HiveDataSource.cpp index 8a4accc91b57..40b72f126dfe 100644 --- a/velox/connectors/hive/HiveDataSource.cpp +++ b/velox/connectors/hive/HiveDataSource.cpp @@ -66,11 +66,11 @@ HiveDataSource::HiveDataSource( folly::Executor* executor, const ConnectorQueryCtx* connectorQueryCtx, const std::shared_ptr& hiveConfig) - : pool_(connectorQueryCtx->memoryPool()), - fileHandleFactory_(fileHandleFactory), + : fileHandleFactory_(fileHandleFactory), executor_(executor), connectorQueryCtx_(connectorQueryCtx), hiveConfig_(hiveConfig), + pool_(connectorQueryCtx->memoryPool()), outputType_(outputType), expressionEvaluator_(connectorQueryCtx->expressionEvaluator()) { // Column handled keyed on the column alias, the name used in the query. @@ -300,6 +300,7 @@ vector_size_t HiveDataSource::applyBucketConversion( for (vector_size_t i = 0; i < rowVector->size(); ++i) { VELOX_CHECK_EQ((partitions_[i] - bucketToKeep) % partitionBucketCount, 0); } + if (remainingFilterExprSet_) { for (vector_size_t i = 0; i < rowVector->size(); ++i) { if (partitions_[i] != bucketToKeep) { @@ -345,76 +346,76 @@ std::optional HiveDataSource::next( output_ = BaseVector::create(readerOutputType_, 0, pool_); } - auto rowsScanned = splitReader_->next(size, output_); + const auto rowsScanned = splitReader_->next(size, output_); completedRows_ += rowsScanned; + if (rowsScanned == 0) { + splitReader_->updateRuntimeStats(runtimeStats_); + resetSplit(); + return nullptr; + } - if (rowsScanned) { - VELOX_CHECK( - !output_->mayHaveNulls(), "Top-level row vector cannot have nulls"); - auto rowsRemaining = output_->size(); + VELOX_CHECK( + !output_->mayHaveNulls(), "Top-level row vector cannot have nulls"); + auto rowsRemaining = output_->size(); + if (rowsRemaining == 0) { + // no rows passed the pushed down filters. + return getEmptyOutput(); + } + + auto rowVector = std::dynamic_pointer_cast(output_); + + // In case there is a remaining filter that excludes some but not all + // rows, collect the indices of the passing rows. If there is no filter, + // or it passes on all rows, leave this as null and let exec::wrap skip + // wrapping the results. + BufferPtr remainingIndices; + if (remainingFilterExprSet_) { + if (numBucketConversion_ > 0) { + filterRows_.resizeFill(rowVector->size()); + } else { + filterRows_.resize(rowVector->size()); + } + } + if (partitionFunction_) { + rowsRemaining = applyBucketConversion(rowVector, remainingIndices); if (rowsRemaining == 0) { - // no rows passed the pushed down filters. return getEmptyOutput(); } + } - auto rowVector = std::dynamic_pointer_cast(output_); - - // In case there is a remaining filter that excludes some but not all - // rows, collect the indices of the passing rows. If there is no filter, - // or it passes on all rows, leave this as null and let exec::wrap skip - // wrapping the results. - BufferPtr remainingIndices; - if (remainingFilterExprSet_) { - if (numBucketConversion_ > 0) { - filterRows_.resizeFill(rowVector->size()); - } else { - filterRows_.resize(rowVector->size()); - } - } - if (partitionFunction_) { - rowsRemaining = applyBucketConversion(rowVector, remainingIndices); - if (rowsRemaining == 0) { - return getEmptyOutput(); - } + if (remainingFilterExprSet_) { + rowsRemaining = evaluateRemainingFilter(rowVector); + VELOX_CHECK_LE(rowsRemaining, rowsScanned); + if (rowsRemaining == 0) { + // No rows passed the remaining filter. + return getEmptyOutput(); } - if (remainingFilterExprSet_) { - rowsRemaining = evaluateRemainingFilter(rowVector); - VELOX_CHECK_LE(rowsRemaining, rowsScanned); - if (rowsRemaining == 0) { - // No rows passed the remaining filter. - return getEmptyOutput(); - } - if (rowsRemaining < rowVector->size()) { - // Some, but not all rows passed the remaining filter. - remainingIndices = filterEvalCtx_.selectedIndices; - } + if (rowsRemaining < rowVector->size()) { + // Some, but not all rows passed the remaining filter. + remainingIndices = filterEvalCtx_.selectedIndices; } + } - if (outputType_->size() == 0) { - return exec::wrap(rowsRemaining, remainingIndices, rowVector); - } + if (outputType_->size() == 0) { + return exec::wrap(rowsRemaining, remainingIndices, rowVector); + } - std::vector outputColumns; - outputColumns.reserve(outputType_->size()); - for (int i = 0; i < outputType_->size(); i++) { - auto& child = rowVector->childAt(i); - if (remainingIndices) { - // Disable dictionary values caching in expression eval so that we - // don't need to reallocate the result for every batch. - child->disableMemo(); - } - outputColumns.emplace_back( - exec::wrapChild(rowsRemaining, remainingIndices, child)); + std::vector outputColumns; + outputColumns.reserve(outputType_->size()); + for (int i = 0; i < outputType_->size(); ++i) { + auto& child = rowVector->childAt(i); + if (remainingIndices) { + // Disable dictionary values caching in expression eval so that we + // don't need to reallocate the result for every batch. + child->disableMemo(); } - - return std::make_shared( - pool_, outputType_, BufferPtr(nullptr), rowsRemaining, outputColumns); + outputColumns.emplace_back( + exec::wrapChild(rowsRemaining, remainingIndices, child)); } - splitReader_->updateRuntimeStats(runtimeStats_); - resetSplit(); - return nullptr; + return std::make_shared( + pool_, outputType_, BufferPtr(nullptr), rowsRemaining, outputColumns); } void HiveDataSource::addDynamicFilter( @@ -505,15 +506,18 @@ vector_size_t HiveDataSource::evaluateRemainingFilter(RowVectorPtr& rowVector) { filterLazyDecoded_, filterLazyBaseRows_); } - auto filterStartMicros = getCurrentTimeMicro(); - expressionEvaluator_->evaluate( - remainingFilterExprSet_.get(), filterRows_, *rowVector, filterResult_); - auto res = exec::processFilterResults( - filterResult_, filterRows_, filterEvalCtx_, pool_); + uint64_t filterTimeUs{0}; + vector_size_t rowsRemaining{0}; + { + MicrosecondTimer timer(&filterTimeUs); + expressionEvaluator_->evaluate( + remainingFilterExprSet_.get(), filterRows_, *rowVector, filterResult_); + rowsRemaining = exec::processFilterResults( + filterResult_, filterRows_, filterEvalCtx_, pool_); + } totalRemainingFilterTime_.fetch_add( - (getCurrentTimeMicro() - filterStartMicros) * 1000, - std::memory_order_relaxed); - return res; + filterTimeUs * 1000, std::memory_order_relaxed); + return rowsRemaining; } void HiveDataSource::resetSplit() { diff --git a/velox/connectors/hive/HiveDataSource.h b/velox/connectors/hive/HiveDataSource.h index 3b1bd22c9952..b7ca9bcb6344 100644 --- a/velox/connectors/hive/HiveDataSource.h +++ b/velox/connectors/hive/HiveDataSource.h @@ -102,9 +102,14 @@ class HiveDataSource : public DataSource { protected: virtual std::unique_ptr createSplitReader(); + FileHandleFactory* const fileHandleFactory_; + folly::Executor* const executor_; + const ConnectorQueryCtx* const connectorQueryCtx_; + const std::shared_ptr hiveConfig_; + memory::MemoryPool* const pool_; + std::shared_ptr split_; std::shared_ptr hiveTableHandle_; - memory::MemoryPool* pool_; std::shared_ptr scanSpec_; VectorPtr output_; std::unique_ptr splitReader_; @@ -119,10 +124,6 @@ class HiveDataSource : public DataSource { std::unordered_map> partitionKeys_; - FileHandleFactory* const fileHandleFactory_; - folly::Executor* const executor_; - const ConnectorQueryCtx* const connectorQueryCtx_; - const std::shared_ptr hiveConfig_; std::shared_ptr ioStats_; std::shared_ptr rowIndexColumn_; diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 5b57ae89ddfa..e9bf9387a067 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -90,7 +90,7 @@ std::unique_ptr SplitReader::create( executor, scanSpec); } else { - return std::make_unique( + return std::unique_ptr(new SplitReader( hiveSplit, hiveTableHandle, partitionKeys, @@ -100,7 +100,7 @@ std::unique_ptr SplitReader::create( ioStats, fileHandleFactory, executor, - scanSpec); + scanSpec)); } } @@ -267,14 +267,14 @@ void SplitReader::createReader( setRowIndexColumn(rowIndexColumn); } configureRowReaderOptions( - baseRowReaderOpts_, hiveTableHandle_->tableParameters(), scanSpec_, std::move(metadataFilter), ROW(std::move(columnNames), std::move(columnTypes)), hiveSplit_, hiveConfig_, - connectorQueryCtx_->sessionProperties()); + connectorQueryCtx_->sessionProperties(), + baseRowReaderOpts_); } bool SplitReader::checkIfSplitIsEmpty( diff --git a/velox/connectors/hive/SplitReader.h b/velox/connectors/hive/SplitReader.h index 5cca68c56c94..bf3b0e7c330f 100644 --- a/velox/connectors/hive/SplitReader.h +++ b/velox/connectors/hive/SplitReader.h @@ -67,19 +67,6 @@ class SplitReader { folly::Executor* executor, const std::shared_ptr& scanSpec); - SplitReader( - const std::shared_ptr& hiveSplit, - const std::shared_ptr& hiveTableHandle, - const std::unordered_map>* - partitionKeys, - const ConnectorQueryCtx* connectorQueryCtx, - const std::shared_ptr& hiveConfig, - const RowTypePtr& readerOutputType, - const std::shared_ptr& ioStats, - FileHandleFactory* fileHandleFactory, - folly::Executor* executor, - const std::shared_ptr& scanSpec); - virtual ~SplitReader() = default; void configureReaderOptions( @@ -113,6 +100,19 @@ class SplitReader { std::string toString() const; protected: + SplitReader( + const std::shared_ptr& hiveSplit, + const std::shared_ptr& hiveTableHandle, + const std::unordered_map>* + partitionKeys, + const ConnectorQueryCtx* connectorQueryCtx, + const std::shared_ptr& hiveConfig, + const RowTypePtr& readerOutputType, + const std::shared_ptr& ioStats, + FileHandleFactory* fileHandleFactory, + folly::Executor* executor, + const std::shared_ptr& scanSpec); + /// Create the dwio::common::Reader object baseReader_, which will be used to /// read the data file's metadata and schema void createReader( diff --git a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp index cd1eb9c3702a..94828d136e6b 100644 --- a/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp +++ b/velox/connectors/hive/iceberg/PositionalDeleteFileReader.cpp @@ -121,12 +121,14 @@ PositionalDeleteFileReader::PositionalDeleteFileReader( dwio::common::RowReaderOptions deleteRowReaderOpts; configureRowReaderOptions( - deleteRowReaderOpts, {}, scanSpec, nullptr, deleteFileSchema, - deleteSplit_); + deleteSplit_, + nullptr, + nullptr, + deleteRowReaderOpts); deleteRowReader_.reset(); deleteRowReader_ = deleteReader->createRowReader(deleteRowReaderOpts); diff --git a/velox/dwio/common/BitConcatenation.h b/velox/dwio/common/BitConcatenation.h index a8933936cf8c..18109114ae66 100644 --- a/velox/dwio/common/BitConcatenation.h +++ b/velox/dwio/common/BitConcatenation.h @@ -59,7 +59,7 @@ class BitConcatenation { void setSize() { if (*buffer_) { - (*buffer_)->setSize(bits::roundUp(numBits_, 8) / 8); + (*buffer_)->setSize(bits::divRoundUp(numBits_, 8)); } } diff --git a/velox/dwio/common/BitPackDecoder.cpp b/velox/dwio/common/BitPackDecoder.cpp index 677963d995ec..917b0ab972c6 100644 --- a/velox/dwio/common/BitPackDecoder.cpp +++ b/velox/dwio/common/BitPackDecoder.cpp @@ -200,7 +200,7 @@ void unpack( bitOffset -= rowBias * bitWidth; if (bitOffset < 0) { // Decrement the pointer by enough bytes to have a non-negative bitOffset. - auto bytes = bits::roundUp(-bitOffset, 8) / 8; + auto bytes = bits::divRoundUp(-bitOffset, 8); bitOffset += bytes * 8; bits = reinterpret_cast( reinterpret_cast(bits) - bytes); @@ -217,7 +217,7 @@ void unpack( bool anyUnsafe = false; if (bufferEnd) { const char* endByte = reinterpret_cast(bits) + - bits::roundUp(bitOffset + (rows.back() + 1) * bitWidth, 8) / 8; + bits::divRoundUp(bitOffset + (rows.back() + 1) * bitWidth, 8); // redzone is the number of bytes at the end of the accessed range that // could overflow the buffer if accessed 64 its wide. int64_t redZone = diff --git a/velox/dwio/common/BitPackDecoder.h b/velox/dwio/common/BitPackDecoder.h index 249841c45f86..2aa785e2d29f 100644 --- a/velox/dwio/common/BitPackDecoder.h +++ b/velox/dwio/common/BitPackDecoder.h @@ -803,7 +803,7 @@ inline uint64_t safeLoadBits( return *reinterpret_cast(ptr) >> bitOffset; } int32_t byteWidth = - facebook::velox::bits::roundUp(bitOffset + bitWidth, 8) / 8; + facebook::velox::bits::divRoundUp(bitOffset + bitWidth, 8); return facebook::velox::bits::loadPartialWord( reinterpret_cast(ptr), byteWidth) >> bitOffset; diff --git a/velox/dwio/common/BufferUtil.h b/velox/dwio/common/BufferUtil.h index 38c5a89bc692..738d81efa167 100644 --- a/velox/dwio/common/BufferUtil.h +++ b/velox/dwio/common/BufferUtil.h @@ -34,13 +34,12 @@ inline void ensureCapacity( oldSize = data->size(); if (!data->isMutable() || data->capacity() < BaseVector::byteSize(capacity)) { - oldSize = data->size(); auto newData = AlignedBuffer::allocate(capacity, pool); if (preserveOldData) { std::memcpy( newData->template asMutable(), data->as(), - data->size()); + oldSize); } data = newData; } diff --git a/velox/dwio/common/ColumnLoader.cpp b/velox/dwio/common/ColumnLoader.cpp index 4db934c35bdd..8fd859f7a2b8 100644 --- a/velox/dwio/common/ColumnLoader.cpp +++ b/velox/dwio/common/ColumnLoader.cpp @@ -52,9 +52,9 @@ void ColumnLoader::loadInternal( version_, structReader_->numReads(), "Loading LazyVector after the enclosing reader has moved"); - auto offset = structReader_->lazyVectorReadOffset(); - auto incomingNulls = structReader_->nulls(); - auto outputRows = structReader_->outputRows(); + const auto offset = structReader_->lazyVectorReadOffset(); + const auto* incomingNulls = structReader_->nulls(); + const auto outputRows = structReader_->outputRows(); raw_vector selectedRows; RowSet effectiveRows; ExceptionContextSetter exceptionContext( diff --git a/velox/dwio/common/ColumnLoader.h b/velox/dwio/common/ColumnLoader.h index 4234ff32578d..ed883d23a143 100644 --- a/velox/dwio/common/ColumnLoader.h +++ b/velox/dwio/common/ColumnLoader.h @@ -38,8 +38,8 @@ class ColumnLoader : public velox::VectorLoader { VectorPtr* result) override; private: - SelectiveStructColumnReaderBase* structReader_; - SelectiveColumnReader* fieldReader_; + SelectiveStructColumnReaderBase* const structReader_; + SelectiveColumnReader* const fieldReader_; // This is checked against the version of 'structReader' on load. If // these differ, 'structReader' has been advanced since the creation // of 'this' and 'this' is no longer loadable. diff --git a/velox/dwio/common/ColumnSelector.cpp b/velox/dwio/common/ColumnSelector.cpp index 5074ca55afd9..0ef2715c74c9 100644 --- a/velox/dwio/common/ColumnSelector.cpp +++ b/velox/dwio/common/ColumnSelector.cpp @@ -163,20 +163,17 @@ void ColumnSelector::copy( } } -/** - * Copy the selector tree and apply file schema to handle schema mismatch - */ ColumnSelector ColumnSelector::apply( const std::shared_ptr& origin, const std::shared_ptr& fileSchema) { - // current instance maybe null + // current instance maybe null. if (origin == nullptr) { return ColumnSelector(fileSchema); } // if selector has no schema, we just build a new tree with file schema // selector.getProjection will carry all logic information including nodes - auto onlyFilter = !origin->hasSchema(); + const bool onlyFilter = !origin->hasSchema(); ColumnSelector cs( onlyFilter ? fileSchema : origin->getSchema(), origin->getNodeFilter(), @@ -318,15 +315,15 @@ const FilterTypePtr& ColumnSelector::process(const std::string& column, bool) { std::pair extractColumnName( const std::string_view& name) { // right now this is the only supported expression for MAP key filter - auto pos = name.find('#'); - if (pos != std::string::npos) { - // special map column handling - auto colName = name.substr(0, pos); - auto expr = name.substr(pos + 1); - return std::make_pair(colName, expr); + const auto pos = name.find('#'); + if (pos == std::string::npos) { + return std::make_pair(name, ""); } - return std::make_pair(name, ""); + // special map column handling + const auto colName = name.substr(0, pos); + const auto expr = name.substr(pos + 1); + return std::make_pair(colName, expr); } void ColumnSelector::logFilter() const { diff --git a/velox/dwio/common/ColumnSelector.h b/velox/dwio/common/ColumnSelector.h index f7997bb62b4b..62408e55a213 100644 --- a/velox/dwio/common/ColumnSelector.h +++ b/velox/dwio/common/ColumnSelector.h @@ -134,8 +134,8 @@ class ColumnSelector { checkSelectColNonDuplicate(fileColumnNamesReadAsLowerCase); } - // set a specific node to read state - // only means we only enable exact the node only. + /// Sets a specific node to read state + /// only means we only enable exact the node only. void setRead(const FilterTypePtr& node, bool only = false); /** @@ -145,7 +145,8 @@ class ColumnSelector { * @return the id in the tree */ const FilterTypePtr& getNode(size_t id) const { - DWIO_ENSURE(inRange(id), "node is out of range"); + VELOX_CHECK( + inRange(id), "node: {} is out of range of {}", id, nodes_.size()); return nodes_[id]; } @@ -252,23 +253,23 @@ class ColumnSelector { return FilterType::getInvalid(); } - // build selected schema based on current filter + /// Builds selected schema based on current filter. std::shared_ptr buildSelected() const; - // build selected schema based on current filter and reorder columns according - // to what filter specifies + /// Builds selected schema based on current filter and reorder columns + /// according to what filter specifies. std::shared_ptr buildSelectedReordered() const; - // build a column filter out of filter tree - // This only returns top columns today and can be extended to node level + /// Build a column filter out of filter tree. + /// This only returns top columns today and can be extended to node level const ColumnFilter& getProjection() const; - // a filter lambda function accept column index for query + /// A filter lambda function accept column index for query. std::function getFilter() const { return [this](uint64_t column) { return shouldReadColumn(column); }; } - // this is essentially the effective schema when column selector was built + /// This is essentially the effective schema when column selector was built. bool hasSchema() const { return schema_ != nullptr; } @@ -288,7 +289,7 @@ class ColumnSelector { const std::vector& keys, const std::vector& values); - // Create a file selector based on a logic selector and disk schema + /// Creates a file selector based on a logic selector and disk schema. static ColumnSelector apply( const std::shared_ptr& origin, const std::shared_ptr& fileSchema); diff --git a/velox/dwio/common/ColumnVisitors.h b/velox/dwio/common/ColumnVisitors.h index 73f4a04c0bf8..8c4764736d75 100644 --- a/velox/dwio/common/ColumnVisitors.h +++ b/velox/dwio/common/ColumnVisitors.h @@ -49,11 +49,11 @@ struct DropValues { } }; -struct ExtractToReader { +class ExtractToReader { + public: using HookType = dwio::common::NoHook; static constexpr bool kSkipNulls = false; - explicit ExtractToReader(SelectiveColumnReader* readerIn) - : reader_(readerIn) {} + explicit ExtractToReader(SelectiveColumnReader* reader) : reader_(reader) {} bool acceptsNulls() const { return true; @@ -74,7 +74,7 @@ struct ExtractToReader { } private: - SelectiveColumnReader* reader_; + SelectiveColumnReader* const reader_; }; template @@ -162,7 +162,7 @@ class ColumnVisitor { TFilter& filter, SelectiveColumnReader* reader, const RowSet& rows, - ExtractValues values) + const ExtractValues& values) : filter_(filter), reader_(reader), allowNulls_(!TFilter::deterministic || filter.testNull()), @@ -314,7 +314,7 @@ class ColumnVisitor { FOLLY_ALWAYS_INLINE vector_size_t process(T value, bool& atEnd) { if (!TFilter::deterministic) { - auto previous = currentRow(); + const auto previous = currentRow(); if (velox::common::applyFilter(filter_, value)) { filterPassed(value); } else { @@ -326,6 +326,7 @@ class ColumnVisitor { } return currentRow() - previous - 1; } + // The filter passes or fails and we go to the next row if any. if (velox::common::applyFilter(filter_, value)) { filterPassed(value); @@ -501,8 +502,8 @@ class ColumnVisitor { template FOLLY_ALWAYS_INLINE void ColumnVisitor::filterFailed() { - auto preceding = filter_.getPrecedingPositionsToFail(); - auto succeeding = filter_.getSucceedingPositionsToFail(); + const auto preceding = filter_.getPrecedingPositionsToFail(); + const auto succeeding = filter_.getSucceedingPositionsToFail(); if (preceding) { reader_->dropResults(preceding); } @@ -717,18 +718,18 @@ class DictionaryColumnVisitor DictionaryColumnVisitor( TFilter& filter, SelectiveColumnReader* reader, - RowSet rows, - ExtractValues values) + const RowSet& rows, + const ExtractValues& values) : ColumnVisitor( filter, reader, rows, values), - state_(reader->scanState().rawState), width_( reader->fileType().type()->kind() == TypeKind::BIGINT ? 8 : reader->fileType().type()->kind() == TypeKind::INTEGER ? 4 - : 2) {} + : 2), + state_(reader->scanState().rawState) {} FOLLY_ALWAYS_INLINE bool isInDict() { if (inDict()) { @@ -753,9 +754,10 @@ class DictionaryColumnVisitor } return super::process(signedValue, atEnd); } - vector_size_t previous = + + const vector_size_t previous = isDense && TFilter::deterministic ? 0 : super::currentRow(); - T valueInDictionary = dict()[value]; + const T valueInDictionary = dict()[value]; if constexpr (!hasFilter()) { super::filterPassed(valueInDictionary); } else { @@ -781,6 +783,7 @@ class DictionaryColumnVisitor } } } + if (++super::rowIndex_ >= super::numRows_) { atEnd = true; return (isDense && TFilter::deterministic) @@ -1108,8 +1111,8 @@ class DictionaryColumnVisitor !std::is_same_v; } - RawScanState state_; const uint8_t width_; + RawScanState state_; }; template diff --git a/velox/dwio/common/DirectBufferedInput.h b/velox/dwio/common/DirectBufferedInput.h index cf81d9744d47..561b2d795f59 100644 --- a/velox/dwio/common/DirectBufferedInput.h +++ b/velox/dwio/common/DirectBufferedInput.h @@ -156,12 +156,11 @@ class DirectBufferedInput : public BufferedInput { } virtual std::unique_ptr clone() const override { - std::unique_ptr input(new DirectBufferedInput( + return std::unique_ptr(new DirectBufferedInput( input_, fileNum_, tracker_, groupId_, ioStats_, executor_, options_)); - return input; } - memory::MemoryPool* pool() { + memory::MemoryPool* pool() const { return pool_; } diff --git a/velox/dwio/common/FilterNode.h b/velox/dwio/common/FilterNode.h index 894fa3550dbc..5aafd0964e28 100644 --- a/velox/dwio/common/FilterNode.h +++ b/velox/dwio/common/FilterNode.h @@ -155,21 +155,6 @@ using ColumnFilter = std::vector; class FilterType; using FilterTypePtr = std::shared_ptr; class FilterType { - private: - const FilterNode node_; - const std::weak_ptr parent_; - std::vector children_; - // a flat to decide if current node is needed - bool read_; - // a flag to indicate if current node is in content - bool inContent_; - // request type in the filter tree node - std::shared_ptr requestType_; - // data type in the filter tree node - std::shared_ptr fileType_; - // sequence filter for given node - empty if no filter - SeqFilter seqFilter_; - public: // a single value indicating not found (invalid node) static const FilterTypePtr& getInvalid() { @@ -214,7 +199,6 @@ class FilterType { return node_; } - public: inline void setRead() { read_ = true; } @@ -256,10 +240,10 @@ class FilterType { return requestType_->kind(); } - // return node ID in the type tree + /// Returns node ID in the type tree inline uint64_t getId() const { // Cannot get ID for invalid node - DWIO_ENSURE_EQ(valid(), true); + VELOX_CHECK(valid()); return node_.node; } @@ -287,6 +271,21 @@ class FilterType { return seqFilter_->empty() || seqFilter_->find(sequence) != seqFilter_->end(); } + + private: + const FilterNode node_; + const std::weak_ptr parent_; + std::vector children_; + // a flat to decide if current node is needed + bool read_; + // a flag to indicate if current node is in content + bool inContent_; + // request type in the filter tree node + std::shared_ptr requestType_; + // data type in the filter tree node + std::shared_ptr fileType_; + // sequence filter for given node - empty if no filter + SeqFilter seqFilter_; }; } // namespace facebook::velox::dwio::common diff --git a/velox/dwio/common/MetadataFilter.cpp b/velox/dwio/common/MetadataFilter.cpp index ed0827d65487..62bf79107e10 100644 --- a/velox/dwio/common/MetadataFilter.cpp +++ b/velox/dwio/common/MetadataFilter.cpp @@ -214,6 +214,7 @@ void MetadataFilter::eval( if (!root_) { return; } + LeafResults leafResults; for (auto& [leaf, result] : leafNodeResults) { VELOX_CHECK_EQ( @@ -226,7 +227,7 @@ void MetadataFilter::eval( "Duplicate results: {}", leaf->field().toString()); } - auto bitCount = finalResult.size() * 64; + const auto bitCount = finalResult.size() * 64; if (auto* combined = root_->eval(leafResults, bitCount)) { bits::orBits(finalResult.data(), combined, 0, bitCount); } diff --git a/velox/dwio/common/OnDemandUnitLoader.cpp b/velox/dwio/common/OnDemandUnitLoader.cpp index 59190d336ac3..d4ef4f0a5ef2 100644 --- a/velox/dwio/common/OnDemandUnitLoader.cpp +++ b/velox/dwio/common/OnDemandUnitLoader.cpp @@ -42,13 +42,13 @@ class OnDemandUnitLoader : public UnitLoader { LoadUnit& getLoadedUnit(uint32_t unit) override { VELOX_CHECK_LT(unit, loadUnits_.size(), "Unit out of range"); - if (loadedUnit_) { - if (*loadedUnit_ == unit) { + if (loadedUnit_.has_value()) { + if (loadedUnit_.value() == unit) { return *loadUnits_[unit]; - } else { - loadUnits_[*loadedUnit_]->unload(); - loadedUnit_.reset(); } + + loadUnits_[*loadedUnit_]->unload(); + loadedUnit_.reset(); } { @@ -74,8 +74,8 @@ class OnDemandUnitLoader : public UnitLoader { } private: - std::vector> loadUnits_; - std::function + const std::vector> loadUnits_; + const std::function blockedOnIoCallback_; std::optional loadedUnit_; }; diff --git a/velox/dwio/common/Options.h b/velox/dwio/common/Options.h index 27864a6de6a3..90c482230240 100644 --- a/velox/dwio/common/Options.h +++ b/velox/dwio/common/Options.h @@ -121,57 +121,12 @@ class FormatSpecificOptions { /// Options for creating a RowReader. class RowReaderOptions { - private: - uint64_t dataStart; - uint64_t dataLength; - bool preloadStripe; - bool projectSelectedType; - bool returnFlatVector_ = false; - ErrorTolerance errorTolerance_; - std::shared_ptr selector_; - RowTypePtr requestedType_; - std::shared_ptr scanSpec_ = nullptr; - std::shared_ptr metadataFilter_; - // Node id for map column to a list of keys to be projected as a struct. - std::unordered_map> flatmapNodeIdAsStruct_; - // Optional executors to enable internal reader parallelism. - // 'decodingExecutor' allow parallelising the vector decoding process. - // 'ioExecutor' enables parallelism when performing file system read - // operations. - std::shared_ptr decodingExecutor_; - size_t decodingParallelismFactor_{0}; - std::optional rowNumberColumnInfo_ = std::nullopt; - - // Function to populate metrics related to feature projection stats - // in Koski. This gets fired in FlatMapColumnReader. - // This is a bit of a hack as there is (by design) no good way - // To propogate information from column reader to Koski - std::function - keySelectionCallback_; - - // Function to track how much time we spend waiting on IO before reading rows - // (in dwrf row reader). todo: encapsulate this and keySelectionCallBack_ in a - // struct - std::function - blockedOnIoCallback_; - std::function - decodingTimeCallback_; - std::function stripeCountCallback_; - bool eagerFirstStripeLoad = true; - uint64_t skipRows_ = 0; - std::shared_ptr unitLoaderFactory_; - - TimestampPrecision timestampPrecision_ = TimestampPrecision::kMilliseconds; - - std::shared_ptr formatSpecificOptions_; - public: RowReaderOptions() noexcept - : dataStart(0), - dataLength(std::numeric_limits::max()), - preloadStripe(false), - projectSelectedType(false) {} + : dataStart_(0), + dataLength_(std::numeric_limits::max()), + preloadStripe_(false), + projectSelectedType_(false) {} /// For files that have structs as the top-level object, select the fields /// to read. The first field is 0, the second 1, and so on. By default, @@ -188,83 +143,83 @@ class RowReaderOptions { return *this; } - /// Set the section of the file to process. + /// Sets the section of the file to process. /// @param offset the starting byte offset /// @param length the number of bytes to read /// @return this RowReaderOptions& range(uint64_t offset, uint64_t length) { - dataStart = offset; - dataLength = length; + dataStart_ = offset; + dataLength_ = length; return *this; } - /// Get the list of selected field or type ids to read. - const std::shared_ptr& getSelector() const { + /// Gets the list of selected field or type ids to read. + const std::shared_ptr& selector() const { return selector_; } - /// Get the start of the range for the data being processed. + /// Gets the start of the range for the data being processed. /// @return if not set, return 0 - uint64_t getOffset() const { - return dataStart; + uint64_t offset() const { + return dataStart_; } - /// Get the length of the range for the data being processed. + /// Gets the length of the range for the data being processed. /// @return if not set, return the maximum unsigned long. - uint64_t getLength() const { - return dataLength; + uint64_t length() const { + return dataLength_; } - /// Get the limit of the range (lowest offset not in the range). + /// Gets the limit of the range (lowest offset not in the range). /// @return if not set, return the maximum unsigned long. - uint64_t getLimit() const { - return ((std::numeric_limits::max() - dataStart) > dataLength) - ? (dataStart + dataLength) + uint64_t limit() const { + return ((std::numeric_limits::max() - dataStart_) > dataLength_) + ? (dataStart_ + dataLength_) : std::numeric_limits::max(); } - /// Request that stripes be pre-loaded. + /// Requests that stripes be pre-loaded. void setPreloadStripe(bool preload) { - preloadStripe = preload; + preloadStripe_ = preload; } /// Are stripes to be pre-loaded? - bool getPreloadStripe() const { - return preloadStripe; + bool preloadStripe() const { + return preloadStripe_; } /// Will load the first stripe on RowReader creation, if true. /// This behavior is already happening in DWRF, but isn't desired for some use /// cases. So this flag allows us to turn it off. void setEagerFirstStripeLoad(bool load) { - eagerFirstStripeLoad = load; + eagerFirstStripeLoad_ = load; } /// Will load the first stripe on RowReader creation, if true. /// This behavior is already happening in DWRF, but isn't desired for some use /// cases. So this flag allows us to turn it off. - bool getEagerFirstStripeLoad() const { - return eagerFirstStripeLoad; + bool eagerFirstStripeLoad() const { + return eagerFirstStripeLoad_; } - // For flat map, return flat vector representation - bool getReturnFlatVector() const { + /// For flat map, return flat vector representation + bool returnFlatVector() const { return returnFlatVector_; } - // For flat map, request that flat vector representation is used + /// For flat map, request that flat vector representation is used void setReturnFlatVector(bool value) { returnFlatVector_ = value; } - /// Request that the selected type be projected. - void setProjectSelectedType(bool vProjectSelectedType) { - projectSelectedType = vProjectSelectedType; + /// Requests that the selected type be projected. + void setProjectSelectedType(bool value) { + projectSelectedType_ = value; } /// Is the selected type to be projected? - bool getProjectSelectedType() const { - return projectSelectedType; + bool projectSelectedType() const { + return projectSelectedType_; } /// Set RowReader error tolerance. @@ -273,7 +228,7 @@ class RowReaderOptions { } /// Get RowReader error tolerance. - const ErrorTolerance& getErrorTolerance() const { + const ErrorTolerance& errorTolerance() const { return errorTolerance_; } @@ -286,7 +241,7 @@ class RowReaderOptions { requestedType_ = std::move(requestedType); } - const std::shared_ptr& getScanSpec() const { + const std::shared_ptr& scanSpec() const { return scanSpec_; } @@ -294,8 +249,7 @@ class RowReaderOptions { scanSpec_ = std::move(scanSpec); } - const std::shared_ptr& getMetadataFilter() - const { + const std::shared_ptr& metadataFilter() const { return metadataFilter_; } @@ -317,7 +271,7 @@ class RowReaderOptions { } const std::unordered_map>& - getMapColumnIdAsStruct() const { + mapColumnIdAsStruct() const { return flatmapNodeIdAsStruct_; } @@ -334,7 +288,7 @@ class RowReaderOptions { rowNumberColumnInfo_ = std::move(rowNumberColumnInfo); } - const std::optional& getRowNumberColumnInfo() const { + const std::optional& rowNumberColumnInfo() const { return rowNumberColumnInfo_; } @@ -347,7 +301,7 @@ class RowReaderOptions { const std::function< void(facebook::velox::dwio::common::flatmap::FlatMapKeySelectionStats)> - getKeySelectionCallback() const { + keySelectionCallback() const { return keySelectionCallback_; } @@ -358,7 +312,7 @@ class RowReaderOptions { } const std::function - getBlockedOnIoCallback() const { + blockedOnIoCallback() const { return blockedOnIoCallback_; } @@ -369,7 +323,7 @@ class RowReaderOptions { } std::function - getDecodingTimeCallback() const { + decodingTimeCallback() const { return decodingTimeCallback_; } @@ -378,7 +332,7 @@ class RowReaderOptions { stripeCountCallback_ = std::move(stripeCountCallback); } - std::function getStripeCountCallback() const { + std::function stripeCountCallback() const { return stripeCountCallback_; } @@ -386,7 +340,7 @@ class RowReaderOptions { skipRows_ = skipRows; } - uint64_t getSkipRows() const { + uint64_t skipRows() const { return skipRows_; } @@ -395,15 +349,15 @@ class RowReaderOptions { unitLoaderFactory_ = std::move(unitLoaderFactory); } - const std::shared_ptr& getUnitLoaderFactory() const { + const std::shared_ptr& unitLoaderFactory() const { return unitLoaderFactory_; } - const std::shared_ptr& getDecodingExecutor() const { + const std::shared_ptr& decodingExecutor() const { return decodingExecutor_; } - size_t getDecodingParallelismFactor() const { + size_t decodingParallelismFactor() const { return decodingParallelismFactor_; } @@ -423,6 +377,52 @@ class RowReaderOptions { std::shared_ptr options) { formatSpecificOptions_ = std::move(options); } + + private: + uint64_t dataStart_; + uint64_t dataLength_; + bool preloadStripe_; + bool projectSelectedType_; + bool returnFlatVector_ = false; + ErrorTolerance errorTolerance_; + std::shared_ptr selector_; + RowTypePtr requestedType_; + std::shared_ptr scanSpec_{nullptr}; + std::shared_ptr metadataFilter_; + // Node id for map column to a list of keys to be projected as a struct. + std::unordered_map> flatmapNodeIdAsStruct_; + // Optional executors to enable internal reader parallelism. + // 'decodingExecutor' allow parallelising the vector decoding process. + // 'ioExecutor' enables parallelism when performing file system read + // operations. + std::shared_ptr decodingExecutor_; + size_t decodingParallelismFactor_{0}; + std::optional rowNumberColumnInfo_{std::nullopt}; + + // Function to populate metrics related to feature projection stats + // in Koski. This gets fired in FlatMapColumnReader. + // This is a bit of a hack as there is (by design) no good way + // To propogate information from column reader to Koski + std::function + keySelectionCallback_; + + // Function to track how much time we spend waiting on IO before reading rows + // (in dwrf row reader). todo: encapsulate this and keySelectionCallBack_ in a + // struct + std::function + blockedOnIoCallback_; + std::function + decodingTimeCallback_; + std::function stripeCountCallback_; + bool eagerFirstStripeLoad_{true}; + uint64_t skipRows_{0}; + + std::shared_ptr unitLoaderFactory_; + + TimestampPrecision timestampPrecision_ = TimestampPrecision::kMilliseconds; + + std::shared_ptr formatSpecificOptions_; }; /// Options for creating a Reader. diff --git a/velox/dwio/common/Reader.cpp b/velox/dwio/common/Reader.cpp index 43b051212721..f7fa7709801d 100644 --- a/velox/dwio/common/Reader.cpp +++ b/velox/dwio/common/Reader.cpp @@ -156,9 +156,9 @@ void RowReader::readWithRowNumber( const dwio::common::Mutation* mutation, VectorPtr& result) { auto* rowVector = result->asUnchecked(); - column_index_t numChildren = 0; - column_index_t numConstChildren = 0; - for (auto& column : options.getScanSpec()->children()) { + column_index_t numChildren{0}; + column_index_t numConstChildren{0}; + for (auto& column : options.scanSpec()->children()) { if (column->projectOut()) { ++numChildren; if (column->isConstant()) { @@ -166,16 +166,17 @@ void RowReader::readWithRowNumber( } } } + VectorPtr rowNumVector; - auto& rowNumberColumnInfo = options.getRowNumberColumnInfo(); + const auto& rowNumberColumnInfo = options.rowNumberColumnInfo(); VELOX_CHECK(rowNumberColumnInfo.has_value()); - auto rowNumberColumnIndex = rowNumberColumnInfo->insertPosition; - auto& rowNumberColumnName = rowNumberColumnInfo->name; + const auto rowNumberColumnIndex = rowNumberColumnInfo->insertPosition; + const auto& rowNumberColumnName = rowNumberColumnInfo->name; VELOX_CHECK_LE(rowNumberColumnIndex, numChildren); if (rowVector->childrenSize() != numChildren) { VELOX_CHECK_EQ(rowVector->childrenSize(), numChildren + 1); rowNumVector = rowVector->childAt(rowNumberColumnIndex); - auto& rowType = rowVector->type()->asRow(); + const auto& rowType = rowVector->type()->asRow(); auto names = rowType.names(); auto types = rowType.children(); auto children = rowVector->children(); @@ -190,8 +191,9 @@ void RowReader::readWithRowNumber( rowVector->size(), std::move(children)); } + columnReader->next(rowsToRead, result, mutation); - FlatVector* flatRowNum = nullptr; + FlatVector* flatRowNum{nullptr}; if (rowNumVector && BaseVector::isVectorWritable(rowNumVector)) { flatRowNum = rowNumVector->asFlatVector(); } @@ -209,16 +211,17 @@ void RowReader::readWithRowNumber( flatRowNum = rowNumVector->asUnchecked>(); } auto* rawRowNum = flatRowNum->mutableRawValues(); - if (numChildren == numConstChildren && !hasDeletion(mutation)) { + if ((numChildren == numConstChildren) && !hasDeletion(mutation)) { VELOX_DCHECK_EQ(rowsToRead, result->size()); std::iota(rawRowNum, rawRowNum + rowsToRead, previousRow); } else { - auto rowOffsets = columnReader->outputRows(); + const auto rowOffsets = columnReader->outputRows(); VELOX_DCHECK_EQ(rowOffsets.size(), result->size()); for (int i = 0; i < rowOffsets.size(); ++i) { rawRowNum[i] = previousRow + rowOffsets[i]; } } + rowVector = result->asUnchecked(); auto& rowType = rowVector->type()->asRow(); auto names = rowType.names(); diff --git a/velox/dwio/common/ScanSpec.cpp b/velox/dwio/common/ScanSpec.cpp index 862f8d782e13..a37cbc98ec6a 100644 --- a/velox/dwio/common/ScanSpec.cpp +++ b/velox/dwio/common/ScanSpec.cpp @@ -31,10 +31,10 @@ ScanSpec* ScanSpec::getOrCreateChild(const std::string& name) { } ScanSpec* ScanSpec::getOrCreateChild(const Subfield& subfield) { - auto container = this; - auto& path = subfield.path(); + auto* container = this; + const auto& path = subfield.path(); for (size_t depth = 0; depth < path.size(); ++depth) { - auto element = path[depth].get(); + const auto element = path[depth].get(); VELOX_CHECK_EQ(element->kind(), kNestedField); auto* nestedField = static_cast(element); container = container->getOrCreateChild(nestedField->name()); @@ -43,7 +43,7 @@ ScanSpec* ScanSpec::getOrCreateChild(const Subfield& subfield) { } uint64_t ScanSpec::newRead() { - if (!numReads_) { + if (numReads_ == 0) { reorder(); } else if (enableFilterReorder_) { for (auto i = 1; i < children_.size(); ++i) { @@ -57,13 +57,14 @@ uint64_t ScanSpec::newRead() { } } } - return numReads_++; + return ++numReads_; } void ScanSpec::reorder() { if (children_.empty()) { return; } + // Make sure 'stableChildren_' is initialized. stableChildren(); std::sort( @@ -267,8 +268,8 @@ bool testStringFilter( bool testBoolFilter( common::Filter* filter, dwio::common::BooleanColumnStatistics* boolStats) { - auto trueCount = boolStats->getTrueCount(); - auto falseCount = boolStats->getFalseCount(); + const auto trueCount = boolStats->getTrueCount(); + const auto falseCount = boolStats->getFalseCount(); if (trueCount.has_value() && falseCount.has_value()) { if (trueCount.value() == 0) { if (!filter->testBool(false)) { @@ -290,7 +291,7 @@ bool testFilter( dwio::common::ColumnStatistics* stats, uint64_t totalRows, const TypePtr& type) { - bool mayHaveNull = true; + bool mayHaveNull{true}; // Has-null statistics is often not set. Hence, we supplement it with // number-of-values statistic to detect no-null columns more often. @@ -308,6 +309,7 @@ bool testFilter( // IS NULL filter cannot pass. return false; } + if (mayHaveNull && filter->testNull()) { return true; } @@ -319,23 +321,23 @@ bool testFilter( case TypeKind::INTEGER: case TypeKind::SMALLINT: case TypeKind::TINYINT: { - auto intStats = + auto* intStats = dynamic_cast(stats); return testIntFilter(filter, intStats, mayHaveNull); } case TypeKind::REAL: case TypeKind::DOUBLE: { - auto doubleStats = + auto* doubleStats = dynamic_cast(stats); return testDoubleFilter(filter, doubleStats, mayHaveNull); } case TypeKind::BOOLEAN: { - auto boolStats = + auto* boolStats = dynamic_cast(stats); return testBoolFilter(filter, boolStats); } case TypeKind::VARCHAR: { - auto stringStats = + auto* stringStats = dynamic_cast(stats); return testStringFilter(filter, stringStats, mayHaveNull); } diff --git a/velox/dwio/common/SeekableInputStream.cpp b/velox/dwio/common/SeekableInputStream.cpp index d59ff6001091..7773445dea5d 100644 --- a/velox/dwio/common/SeekableInputStream.cpp +++ b/velox/dwio/common/SeekableInputStream.cpp @@ -37,7 +37,7 @@ void printBuffer(std::ostream& out, const char* buffer, uint64_t length) { } uint64_t PositionProvider::next() { - uint64_t result = *position_; + const uint64_t result = *position_; ++position_; return result; } diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.cpp b/velox/dwio/common/SelectiveByteRleColumnReader.cpp index e649a1d425f0..97b00a71e153 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.cpp +++ b/velox/dwio/common/SelectiveByteRleColumnReader.cpp @@ -18,7 +18,9 @@ namespace facebook::velox::dwio::common { -void SelectiveByteRleColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveByteRleColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { switch (requestedType_->kind()) { case TypeKind::BOOLEAN: getFlatValues(rows, result, requestedType_); diff --git a/velox/dwio/common/SelectiveByteRleColumnReader.h b/velox/dwio/common/SelectiveByteRleColumnReader.h index 3818edeebad2..407cde02ccf5 100644 --- a/velox/dwio/common/SelectiveByteRleColumnReader.h +++ b/velox/dwio/common/SelectiveByteRleColumnReader.h @@ -37,7 +37,7 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { return false; } - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; template < typename Reader, @@ -47,10 +47,10 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { void processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows); + const RowSet& rows); template - void processValueHook(RowSet rows, ValueHook* hook); + void processValueHook(const RowSet& rows, ValueHook* hook); template < typename Reader, @@ -59,12 +59,14 @@ class SelectiveByteRleColumnReader : public SelectiveColumnReader { typename ExtractValues> void readHelper( velox::common::Filter* filter, - RowSet rows, - ExtractValues extractValues); + const RowSet& rows, + const ExtractValues& extractValues); template - void - readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + void readCommon( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls); }; template < @@ -74,8 +76,8 @@ template < typename ExtractValues> void SelectiveByteRleColumnReader::readHelper( velox::common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { + const RowSet& rows, + const ExtractValues& extractValues) { reinterpret_cast(this)->readWithVisitor( rows, ColumnVisitor( @@ -90,7 +92,7 @@ template < void SelectiveByteRleColumnReader::processFilter( velox::common::Filter* filter, ExtractValues extractValues, - RowSet rows) { + const RowSet& rows) { using velox::common::FilterKind; switch (filter ? filter->kind() : FilterKind::kAlwaysTrue) { case FilterKind::kAlwaysTrue: @@ -145,7 +147,7 @@ void SelectiveByteRleColumnReader::processFilter( template void SelectiveByteRleColumnReader::processValueHook( - RowSet rows, + const RowSet& rows, ValueHook* hook) { using namespace facebook::velox::aggregate; switch (hook->kind()) { @@ -162,10 +164,10 @@ void SelectiveByteRleColumnReader::processValueHook( template void SelectiveByteRleColumnReader::readCommon( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); - bool isDense = rows.back() == rows.size() - 1; + const bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = scanSpec_->filter() ? scanSpec_->filter() : &dwio::common::alwaysTrue(); if (scanSpec_->keepValues()) { diff --git a/velox/dwio/common/SelectiveColumnReader.cpp b/velox/dwio/common/SelectiveColumnReader.cpp index b6be330b88fb..c7a3ad67a4e2 100644 --- a/velox/dwio/common/SelectiveColumnReader.cpp +++ b/velox/dwio/common/SelectiveColumnReader.cpp @@ -46,7 +46,7 @@ SelectiveColumnReader::SelectiveColumnReader( std::shared_ptr fileType, dwio::common::FormatParams& params, velox::common::ScanSpec& scanSpec) - : memoryPool_(params.pool()), + : memoryPool_(¶ms.pool()), requestedType_(requestedType), fileType_(fileType), formatData_(params.toFormatData(fileType, scanSpec)), @@ -70,13 +70,13 @@ void SelectiveColumnReader::seekTo(vector_size_t offset, bool readsNullsOnly) { return; } if (readOffset_ < offset) { - if (numParentNulls_) { + if (numParentNulls_ > 0) { VELOX_CHECK_LE( parentNullsRecordedTo_, offset, "Must not seek to before parentNullsRecordedTo_"); } - auto distance = offset - readOffset_ - numParentNulls_; + const auto distance = offset - readOffset_ - numParentNulls_; numParentNulls_ = 0; parentNullsRecordedTo_ = 0; if (readsNullsOnly) { @@ -86,14 +86,17 @@ void SelectiveColumnReader::seekTo(vector_size_t offset, bool readsNullsOnly) { } readOffset_ = offset; } else { - VELOX_FAIL("Seeking backward on a ColumnReader"); + VELOX_FAIL( + "Seeking backward on a ColumnReader from {} to {}", + readOffset_, + offset); } } -void SelectiveColumnReader::initReturnReaderNulls(RowSet rows) { +void SelectiveColumnReader::initReturnReaderNulls(const RowSet& rows) { if (useBulkPath() && !scanSpec_->hasFilter()) { anyNulls_ = nullsInReadRange_ != nullptr; - bool isDense = rows.back() == rows.size() - 1; + const bool isDense = rows.back() == rows.size() - 1; returnReaderNulls_ = anyNulls_ && isDense; } else { returnReaderNulls_ = false; @@ -101,25 +104,27 @@ void SelectiveColumnReader::initReturnReaderNulls(RowSet rows) { } void SelectiveColumnReader::prepareNulls( - RowSet rows, + const RowSet& rows, bool hasNulls, int32_t extraRows) { if (!hasNulls) { anyNulls_ = false; return; } + initReturnReaderNulls(rows); if (returnReaderNulls_) { // No need for null flags if fast path. return; } - auto numRows = rows.size() + extraRows; + + const auto numRows = rows.size() + extraRows; if (resultNulls_ && resultNulls_->unique() && resultNulls_->capacity() >= bits::nbytes(numRows) + simd::kPadding) { resultNulls_->setSize(bits::nbytes(numRows)); } else { resultNulls_ = AlignedBuffer::allocate( - numRows + (simd::kPadding * 8), &memoryPool_); + numRows + (simd::kPadding * 8), memoryPool_); rawResultNulls_ = resultNulls_->asMutable(); } anyNulls_ = false; @@ -128,7 +133,7 @@ void SelectiveColumnReader::prepareNulls( simd::memset(rawResultNulls_, bits::kNotNullByte, resultNulls_->capacity()); } -const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { +const uint64_t* SelectiveColumnReader::shouldMoveNulls(const RowSet& rows) { if (rows.size() == numValues_ || !anyNulls_) { // Nulls will only be moved if there is a selection on values. A cast // alone does not move nulls. @@ -139,7 +144,7 @@ const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { if (!(resultNulls_ && resultNulls_->unique() && resultNulls_->capacity() >= rows.size() + simd::kPadding)) { resultNulls_ = AlignedBuffer::allocate( - rows.size() + (simd::kPadding * 8), &memoryPool_); + rows.size() + (simd::kPadding * 8), memoryPool_); rawResultNulls_ = resultNulls_->asMutable(); } moveFrom = nullsInReadRange_->as(); @@ -151,8 +156,9 @@ const uint64_t* SelectiveColumnReader::shouldMoveNulls(RowSet rows) { return moveFrom; } -void SelectiveColumnReader::setComplexNulls(RowSet rows, VectorPtr& result) - const { +void SelectiveColumnReader::setComplexNulls( + const RowSet& rows, + VectorPtr& result) const { if (!nullsInReadRange_) { if (result->isNullsWritable()) { result->clearNulls(0, rows.size()); @@ -183,7 +189,7 @@ void SelectiveColumnReader::setComplexNulls(RowSet rows, VectorPtr& result) } void SelectiveColumnReader::getIntValues( - RowSet rows, + const RowSet& rows, const TypePtr& requestedType, VectorPtr* result) { switch (requestedType->kind()) { @@ -254,7 +260,7 @@ void SelectiveColumnReader::getIntValues( } void SelectiveColumnReader::getUnsignedIntValues( - RowSet rows, + const RowSet& rows, const TypePtr& requestedType, VectorPtr* result) { switch (requestedType->kind()) { @@ -324,7 +330,7 @@ void SelectiveColumnReader::getUnsignedIntValues( template <> void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal) { @@ -332,7 +338,7 @@ void SelectiveColumnReader::getFlatValues( VELOX_CHECK_EQ(valueSize_, sizeof(int8_t)); compactScalarValues(rows, isFinal); auto boolValues = - AlignedBuffer::allocate(numValues_, &memoryPool_, false); + AlignedBuffer::allocate(numValues_, memoryPool_, false); auto rawBytes = values_->as(); auto zero = xsimd::broadcast(0); if constexpr (kWidth == 32) { @@ -350,7 +356,7 @@ void SelectiveColumnReader::getFlatValues( } } *result = std::make_shared>( - &memoryPool_, + memoryPool_, type, resultNulls(), numValues_, @@ -360,7 +366,7 @@ void SelectiveColumnReader::getFlatValues( template <> void SelectiveColumnReader::compactScalarValues( - RowSet rows, + const RowSet& rows, bool isFinal) { if (!values_ || rows.size() == numValues_) { if (values_) { @@ -377,7 +383,7 @@ void SelectiveColumnReader::compactScalarValues( continue; } - VELOX_DCHECK(outputRows_[i] == nextRow); + VELOX_DCHECK_EQ(outputRows_[i], nextRow); bits::setBit(rawBits, rowIndex, bits::isBitSet(rawBits, i)); if (moveNullsFrom && rowIndex != i) { @@ -401,7 +407,7 @@ char* SelectiveColumnReader::copyStringValue(folly::StringPiece value) { uint64_t size = value.size(); if (stringBuffers_.empty() || rawStringUsed_ + size > rawStringSize_) { auto bytes = std::max(size, kStringBufferSize); - BufferPtr buffer = AlignedBuffer::allocate(bytes, &memoryPool_); + BufferPtr buffer = AlignedBuffer::allocate(bytes, memoryPool_); // Use the preferred size instead of the requested one to improve memory // efficiency. buffer->setSize(buffer->capacity()); @@ -451,8 +457,8 @@ void SelectiveColumnReader::resetFilterCaches() { void SelectiveColumnReader::addParentNulls( int32_t firstRowInNulls, const uint64_t* nulls, - RowSet rows) { - int32_t firstNullIndex = + const RowSet& rows) { + const int32_t firstNullIndex = readOffset_ < firstRowInNulls ? 0 : readOffset_ - firstRowInNulls; numParentNulls_ += nulls ? bits::countNulls(nulls, firstNullIndex, rows.back() + 1) : 0; diff --git a/velox/dwio/common/SelectiveColumnReader.h b/velox/dwio/common/SelectiveColumnReader.h index 821498a409f4..053db8884884 100644 --- a/velox/dwio/common/SelectiveColumnReader.h +++ b/velox/dwio/common/SelectiveColumnReader.h @@ -27,20 +27,20 @@ namespace facebook::velox::dwio::common { -// Generalized representation of a set of distinct values for dictionary -// encodings. +/// Generalized representation of a set of distinct values for dictionary +/// encodings. struct DictionaryValues { - // Array of values for dictionary. StringViews for string values. + /// Array of values for dictionary. StringViews for string values. BufferPtr values; - // For a string dictionary, holds the characters that are pointed to by - // StringViews in 'values'. + /// For a string dictionary, holds the characters that are pointed to by + /// StringViews in 'values'. BufferPtr strings; - // Number of valid elements in 'values'. + /// Number of valid elements in 'values'. int32_t numValues{0}; - // True if values are in ascending order. + /// True if values are in ascending order. bool sorted{false}; void clear() { @@ -78,18 +78,17 @@ struct RawScanState { uint8_t* __restrict filterCache; }; -// Maintains state for encoding between calls to readWithVisitor of -// individual readers. DWRF sets up encoding information at the -// start of a stripe and dictionaries at the start of stripes and -// optionally row groups. Other encodings can set dictionaries and -// encoding types at any time during processing a stripe. +// Maintains state for encoding between calls to readWithVisitor of individual +// readers. DWRF sets up encoding information at the start of a stripe and +// dictionaries at the start of stripes and optionally row groups. Other +// encodings can set dictionaries and encoding types at any time during +// processing a stripe. // -// This is the union of the state elements that the supported -// encodings require for keeping state. This may be augmented when -// adding formats. This is however inlined in the reader superclass -// ad not for example nodeled as a class hierarchy with virtual -// functions because this needs to be trivially and branchlessly -// accessible. +// This is the union of the state elements that the supported encodings require +// for keeping state. This may be augmented when adding formats. This is however +// inlined in the reader superclass and not for example modeled as a class +// hierarchy with virtual functions because this needs to be trivially and +// branchlessly accessible. struct ScanState { // Copies the owned values of 'this' into 'rawState'. void updateRawState(); @@ -168,17 +167,19 @@ class SelectiveColumnReader { // relative to 'offset', so that row 0 is the 'offset'th row from // start of stripe. 'rows' is expected to stay constant // between this and the next call to read. - virtual void - read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) = 0; + virtual void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) = 0; virtual uint64_t skip(uint64_t numValues) { return formatData_->skip(numValues); } - // Extracts the values at 'rows' into '*result'. May rewrite or - // reallocate '*result'. 'rows' must be the same set or a subset of - // 'rows' passed to the last 'read(). - virtual void getValues(RowSet rows, VectorPtr* result) = 0; + /// Extracts the values at 'rows' into '*result'. May rewrite or reallocate + /// '*result'. 'rows' must be the same set or a subset of 'rows' passed to the + /// last 'read(). + virtual void getValues(const RowSet& rows, VectorPtr* result) = 0; // Returns the rows that were selected/visited by the last // read(). If 'this' has no filter, returns 'rows' passed to last @@ -194,10 +195,10 @@ class SelectiveColumnReader { // offset-th from the start of stripe. virtual void seekTo(vector_size_t offset, bool readsNullsOnly); - // Positions this at the start of 'index'th row - // group. Interpretation of 'index' depends on format. Clears counts - // of skipped enclosing struct nulls for formats where nulls are - // recorded at each nesting level, i.e. not rep-def. + /// Positions this at the start of 'index'th row group. Interpretation of + /// 'index' depends on format. Clears counts of skipped enclosing struct nulls + /// for formats where nulls are recorded at each nesting level, i.e. not + /// rep-def. virtual void seekToRowGroup(uint32_t index) { VELOX_TRACE_HISTORY_PUSH("seekToRowGroup %u", index); numParentNulls_ = 0; @@ -212,7 +213,7 @@ class SelectiveColumnReader { return *fileType_; } - // The below functions are called from ColumnVisitor to fill the result set. + /// The below functions are called from ColumnVisitor to fill the result set. inline void addOutputRow(vector_size_t row) { outputRows_.push_back(row); } @@ -246,7 +247,7 @@ class SelectiveColumnReader { uint64_t* mutableNulls(int32_t size) { if (!resultNulls_->unique()) { resultNulls_ = AlignedBuffer::allocate( - numValues_ + size, &memoryPool_, bits::kNotNull); + numValues_ + size, memoryPool_, bits::kNotNull); rawResultNulls_ = resultNulls_->asMutable(); } if (resultNulls_->capacity() * 8 < numValues_ + size) { @@ -272,7 +273,7 @@ class SelectiveColumnReader { return returnReaderNulls_; } - void initReturnReaderNulls(RowSet rows); + void initReturnReaderNulls(const RowSet& rows); void setNumValues(vector_size_t size) { numValues_ = size; @@ -328,12 +329,12 @@ class SelectiveColumnReader { anyNulls_ = true; bits::setNull(rawResultNulls_, numValues_); - // Set the default value at the nominal width of the reader but - // calculate the index based on the actual width of the - // data. These may differ for integer and dictionary readers. - auto valuesAsChar = reinterpret_cast(rawValues_); + // Set the default value at the nominal width of the reader but calculate + // the index based on the actual width of the data. These may differ for + // integer and dictionary readers. + auto* valuesAsChar = reinterpret_cast(rawValues_); *reinterpret_cast(valuesAsChar + valueSize_ * numValues_) = T(); - numValues_++; + ++numValues_; } template @@ -344,7 +345,7 @@ class SelectiveColumnReader { VELOX_DCHECK_NOT_NULL(rawValues_); VELOX_DCHECK_LE((numValues_ + 1) * sizeof(T), values_->capacity()); reinterpret_cast(rawValues_)[numValues_] = value; - numValues_++; + ++numValues_; } void dropResults(vector_size_t count) { @@ -368,8 +369,8 @@ class SelectiveColumnReader { setReadOffset(readOffset); } - // Recursively sets 'isTopLevel_'. Recurses down non-nullable structs, - // otherwise only sets 'isTopLevel_' of 'this' + /// Recursively sets 'isTopLevel_'. Recurses down non-nullable structs, + /// otherwise only sets 'isTopLevel_' of 'this' virtual void setIsTopLevel() { isTopLevel_ = true; } @@ -407,12 +408,11 @@ class SelectiveColumnReader { return nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; } - // Returns true if no filters or deterministic filters/hooks that - // discard nulls. This is used at read prepare time. useFastPath() - // in DecoderUtil.h is used at read time and is expected to produce - // the same result. + /// Returns true if no filters or deterministic filters/hooks that discard + /// nulls. This is used at read prepare time. useFastPath() in DecoderUtil.h + /// is used at read time and is expected to produce the same result. bool useBulkPath() const { - auto filter = scanSpec_->filter(); + auto* filter = scanSpec_->filter(); return hasBulkPath() && process::hasAvx2() && (!filter || (filter->isDeterministic() && @@ -440,16 +440,17 @@ class SelectiveColumnReader { // converts to direct in mid-read. virtual void dedictionarize() {} - // A reader nested inside nullable containers has fewer rows than - // the top level table. addParentNulls records how many parent nulls - // there are between the position of 'this' and 'rows.back() + 1', - // i.e. the position of the scan in top level rows. 'firstRowInNulls' is - // the top level row corresponding to the first bit in - // 'nulls'. 'nulls' is in terms of top level rows and represents all - // null parents at any enclosing level. 'nulls' is nullptr if there are no - // parent nulls. - void - addParentNulls(int32_t firstRowInNulls, const uint64_t* nulls, RowSet rows); + /// A reader nested inside nullable containers has fewer rows than the top + /// level table. addParentNulls records how many parent nulls there are + /// between the position of 'this' and 'rows.back() + 1', i.e. the position of + /// the scan in top level rows. 'firstRowInNulls' is the top level row + /// corresponding to the first bit in 'nulls'. 'nulls' is in terms of top + /// level rows and represents all null parents at any enclosing level. 'nulls' + /// is nullptr if there are no parent nulls. + void addParentNulls( + int32_t firstRowInNulls, + const uint64_t* nulls, + const RowSet& rows); // When skipping rows in a struct, records how many parent nulls at // any level there are between top level row 'from' and 'to'. If @@ -466,16 +467,16 @@ class SelectiveColumnReader { // Prepares the result buffer for nulls for reading 'rows'. Leaves // 'extraSpace' bits worth of space in the nulls buffer. - void prepareNulls(RowSet rows, bool hasNulls, int32_t extraRows = 0); + void prepareNulls(const RowSet& rows, bool hasNulls, int32_t extraRows = 0); void setIsFlatMapValue(bool value) { isFlatMapValue_ = value; } - // Filters 'rows' according to 'is_null'. Only applies to cases where - // scanSpec_->readsNullsOnly() is true. + /// Filters 'rows' according to 'isNull'. Only applies to cases where + /// scanSpec_->readsNullsOnly() is true. template - void filterNulls(RowSet rows, bool isNull, bool extractValues); + void filterNulls(const RowSet& rows, bool isNull, bool extractValues); // Temporary method for estimate total in-memory byte size and row count of // current encoding chunk on this column for Nimble. Will be removed once @@ -506,31 +507,34 @@ class SelectiveColumnReader { protected: template - void - prepareRead(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + void prepareRead( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls); virtual bool readsNullsOnly() const { return scanSpec_->readsNullsOnly(); } - void setOutputRows(RowSet rows) { + void setOutputRows(const RowSet& rows) { outputRows_.resize(rows.size()); - if (!rows.size()) { + if (rows.empty()) { return; } - memcpy(outputRows_.data(), &rows[0], rows.size() * sizeof(vector_size_t)); + ::memcpy(outputRows_.data(), &rows[0], rows.size() * sizeof(vector_size_t)); } - // Returns integer values for 'rows' cast to the width of - // 'requestedType' in '*result'. - void - getIntValues(RowSet rows, const TypePtr& requestedType, VectorPtr* result); + /// Returns integer values for 'rows' cast to the width of 'requestedType' in + /// '*result'. + void getIntValues( + const RowSet& rows, + const TypePtr& requestedType, + VectorPtr* result); - // Returns integer values for 'rows' cast to the width of - // 'requestedType' in '*result', the related fileDataType is unsigned int - // type. + /// Returns integer values for 'rows' cast to the width of 'requestedType' in + /// '*result', the related fileDataType is unsigned int type. void getUnsignedIntValues( - RowSet rows, + const RowSet& rows, const TypePtr& requestedType, VectorPtr* result); @@ -540,26 +544,26 @@ class SelectiveColumnReader { // to rows. TODO: Consider isFinal as template parameter. template void getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal = false); template - void compactScalarValues(RowSet rows, bool isFinal); + void compactScalarValues(const RowSet& rows, bool isFinal); template - void upcastScalarValues(RowSet rows); + void upcastScalarValues(const RowSet& rows); // For complex type column, we need to compact only nulls if the rows are // shrinked. Child fields are handled recursively in their own column // readers. - void setComplexNulls(RowSet rows, VectorPtr& result) const; + void setComplexNulls(const RowSet& rows, VectorPtr& result) const; // Return the source null bits if compactScalarValues and upcastScalarValues // should move null flags. Return nullptr if nulls does not need to be moved. // Checks consistency of nulls-related state. - const uint64_t* shouldMoveNulls(RowSet rows); + const uint64_t* shouldMoveNulls(const RowSet& rows); void addStringValue(folly::StringPiece value); @@ -575,8 +579,8 @@ class SelectiveColumnReader { void decodeWithVisitor( IntDecoder* intDecoder, ColumnVisitor& visitor) { - auto decoder = dynamic_cast(intDecoder); - VELOX_CHECK( + auto* decoder = dynamic_cast(intDecoder); + VELOX_CHECK_NOT_NULL( decoder, "Unexpected Decoder type, Expected: {}", typeid(Decoder).name()); @@ -596,21 +600,21 @@ class SelectiveColumnReader { : resultNulls_; } - memory::MemoryPool& memoryPool_; + memory::MemoryPool* const memoryPool_; // The requested data type - TypePtr requestedType_; + const TypePtr requestedType_; // The file data type - std::shared_ptr fileType_; + const std::shared_ptr fileType_; // Format specific state and functions. - std::unique_ptr formatData_; + const std::unique_ptr formatData_; // Specification of filters, value extraction, pruning etc. The // spec is assigned at construction and the contents may change at // run time based on adaptation. Owned by caller. - velox::common::ScanSpec* scanSpec_; + velox::common::ScanSpec* const scanSpec_; // Row number after last read row, relative to the ORC stripe or Parquet // Rowgroup start. @@ -628,12 +632,12 @@ class SelectiveColumnReader { // The rows to process in read(). References memory supplied by // caller. The values must remain live until the next call to read(). RowSet inputRows_; - // Rows passing the filter in readWithVisitor. Must stay - // constant between consecutive calls to read(). + // Rows passing the filter in readWithVisitor. Must stay constant between + // consecutive calls to read(). raw_vector outputRows_; - // The row number - // corresponding to each element in 'values_' + // The row number corresponding to each element in 'values_' raw_vector valueRows_; + // The set of all nulls in the range of read(). Created when first // needed and then reused. May be referenced by result if all rows are // selected. @@ -655,11 +659,10 @@ class SelectiveColumnReader { // true if 'this' is in a state where gatValues can be called. bool mayGetValues_ = false; - // True if row numbers of 'this' correspond 1:1 to row numbers in - // the file. This is false inside lists, maps and nullable - // structs. If true, a skip of n rows can use row group indices to - // skip long distances. Lazy vectors will only be made for results - // of top level readers. + // True if row numbers of 'this' correspond 1:1 to row numbers in the file. + // This is false inside lists, maps and nullable structs. If true, a skip of n + // rows can use row group indices to skip long distances. Lazy vectors will + // only be made for results of top level readers. bool isTopLevel_{false}; // Maps from position in non-null rows to a position in value @@ -675,8 +678,8 @@ class SelectiveColumnReader { // True if a vector can acquire a pin to a stream's buffer and refer // to that as its values. bool mayUseStreamBuffer_ = false; - // True if nulls and everything selected, so that nullsInReadRange - // can be returned as the null flags of the vector in getValues(). + // True if nulls and everything selected, so that nullsInReadRange can be + // returned as the null flags of the vector in getValues(). bool returnReaderNulls_ = false; // Total writable bytes in 'rawStringBuffer_'. int32_t rawStringSize_ = 0; diff --git a/velox/dwio/common/SelectiveColumnReaderInternal.h b/velox/dwio/common/SelectiveColumnReaderInternal.h index 64ba91abec23..689f54b5e818 100644 --- a/velox/dwio/common/SelectiveColumnReaderInternal.h +++ b/velox/dwio/common/SelectiveColumnReaderInternal.h @@ -39,19 +39,19 @@ void SelectiveColumnReader::ensureValuesCapacity(vector_size_t numRows) { return; } values_ = AlignedBuffer::allocate( - numRows + (simd::kPadding / sizeof(T)), &memoryPool_); + numRows + (simd::kPadding / sizeof(T)), memoryPool_); rawValues_ = values_->asMutable(); } template void SelectiveColumnReader::prepareRead( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { const bool readsNullsOnly = this->readsNullsOnly(); seekTo(offset, readsNullsOnly); - vector_size_t numRows = rows.back() + 1; + const vector_size_t numRows = rows.back() + 1; if (isFlatMapValue_) { if (!nullsInReadRange_) { nullsInReadRange_ = std::move(flatMapValueNullsInReadRange_); @@ -59,11 +59,13 @@ void SelectiveColumnReader::prepareRead( } else if (nullsInReadRange_ && !nullsInReadRange_->unique()) { nullsInReadRange_.reset(); } + formatData_->readNulls( numRows, incomingNulls, nullsInReadRange_, readsNullsOnly); if (isFlatMapValue_ && nullsInReadRange_) { flatMapValueNullsInReadRange_ = nullsInReadRange_; } + // We check for all nulls and no nulls. We expect both calls to // bits::isAllSet to fail early in the common case. We could do a // single traversal of null bits counting the bits and then compare @@ -78,10 +80,11 @@ void SelectiveColumnReader::prepareRead( nullsInReadRange_->as(), 0, numRows, bits::kNotNull)) { nullsInReadRange_ = nullptr; } + innerNonNullRows_.clear(); outerNonNullRows_.clear(); outputRows_.clear(); - // is part of read() and after read returns getValues may be called. + // Is part of read() and after read returns getValues may be called. mayGetValues_ = true; numValues_ = 0; valueSize_ = sizeof(T); @@ -89,6 +92,7 @@ void SelectiveColumnReader::prepareRead( if (scanSpec_->filter() || hasDeletion()) { outputRows_.reserve(rows.size()); } + ensureValuesCapacity(rows.size()); if (scanSpec_->keepValues() && !scanSpec_->valueHook()) { valueRows_.clear(); @@ -98,7 +102,7 @@ void SelectiveColumnReader::prepareRead( template void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal) { @@ -107,6 +111,7 @@ void SelectiveColumnReader::getFlatValues( if (isFinal) { mayGetValues_ = false; } + if (allNull_) { if (isFlatMapValue_) { if (flatMapValueConstantNullValues_) { @@ -114,15 +119,16 @@ void SelectiveColumnReader::getFlatValues( } else { flatMapValueConstantNullValues_ = std::make_shared>( - &memoryPool_, rows.size(), true, type, T()); + memoryPool_, rows.size(), true, type, T()); } *result = flatMapValueConstantNullValues_; } else { *result = std::make_shared>( - &memoryPool_, rows.size(), true, type, T()); + memoryPool_, rows.size(), true, type, T()); } return; } + if (valueSize_ == sizeof(TVector)) { compactScalarValues(rows, isFinal); } else if (sizeof(T) >= sizeof(TVector)) { @@ -140,7 +146,7 @@ void SelectiveColumnReader::getFlatValues( flat->setStringBuffers(std::move(stringBuffers_)); } else { flatMapValueFlatValues_ = std::make_shared>( - &memoryPool_, + memoryPool_, type, resultNulls(), numValues_, @@ -150,7 +156,7 @@ void SelectiveColumnReader::getFlatValues( *result = flatMapValueFlatValues_; } else { *result = std::make_shared>( - &memoryPool_, + memoryPool_, type, resultNulls(), numValues_, @@ -161,13 +167,13 @@ void SelectiveColumnReader::getFlatValues( template <> void SelectiveColumnReader::getFlatValues( - RowSet rows, + const RowSet& rows, VectorPtr* result, const TypePtr& type, bool isFinal); template -void SelectiveColumnReader::upcastScalarValues(RowSet rows) { +void SelectiveColumnReader::upcastScalarValues(const RowSet& rows) { VELOX_CHECK_LE(rows.size(), numValues_); VELOX_CHECK(!rows.empty()); if (!values_) { @@ -222,7 +228,9 @@ void SelectiveColumnReader::upcastScalarValues(RowSet rows) { } template -void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { +void SelectiveColumnReader::compactScalarValues( + const RowSet& rows, + bool isFinal) { VELOX_CHECK_LE(rows.size(), numValues_); VELOX_CHECK(!rows.empty()); if (!values_ || (rows.size() == numValues_ && sizeof(T) == sizeof(TVector))) { @@ -231,6 +239,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { } return; } + VELOX_CHECK_LE(sizeof(TVector), sizeof(T)); T* typedSourceValues = reinterpret_cast(rawValues_); TVector* typedDestValues = reinterpret_cast(rawValues_); @@ -249,15 +258,16 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { if (valueRows_.empty()) { valueRows_.resize(rows.size()); } + vector_size_t rowIndex = 0; auto nextRow = rows[rowIndex]; - auto* moveNullsFrom = shouldMoveNulls(rows); - for (size_t i = 0; i < numValues_; i++) { + const auto* moveNullsFrom = shouldMoveNulls(rows); + for (size_t i = 0; i < numValues_; ++i) { if (sourceRows[i] < nextRow) { continue; } - VELOX_DCHECK(sourceRows[i] == nextRow); + VELOX_DCHECK_EQ(sourceRows[i], nextRow); typedDestValues[rowIndex] = typedSourceValues[i]; if (moveNullsFrom && rowIndex != i) { bits::setBit(rawResultNulls_, rowIndex, bits::isBitSet(moveNullsFrom, i)); @@ -265,12 +275,13 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { if (!isFinal) { valueRows_[rowIndex] = nextRow; } - rowIndex++; + ++rowIndex; if (rowIndex >= rows.size()) { break; } nextRow = rows[rowIndex]; } + numValues_ = rows.size(); valueRows_.resize(numValues_); values_->setSize(numValues_ * sizeof(TVector)); @@ -278,7 +289,7 @@ void SelectiveColumnReader::compactScalarValues(RowSet rows, bool isFinal) { template <> void SelectiveColumnReader::compactScalarValues( - RowSet rows, + const RowSet& rows, bool isFinal); inline int32_t sizeOfIntKind(TypeKind kind) { @@ -290,20 +301,20 @@ inline int32_t sizeOfIntKind(TypeKind kind) { case TypeKind::BIGINT: return 8; default: - VELOX_FAIL("Not an integer TypeKind"); + VELOX_FAIL("Not an integer TypeKind: {}", static_cast(kind)); } } template void SelectiveColumnReader::filterNulls( - RowSet rows, + const RowSet& rows, bool isNull, bool extractValues) { - bool isDense = rows.back() == rows.size() - 1; + const bool isDense = rows.back() == rows.size() - 1; // We decide is (not) null based on 'nullsInReadRange_'. This may be // set due to nulls in enclosing structs even if the column itself // does not add nulls. - auto rawNulls = + auto* rawNulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; if (isNull) { if (!rawNulls) { @@ -326,7 +337,6 @@ void SelectiveColumnReader::filterNulls( } } } - return; } diff --git a/velox/dwio/common/SelectiveFloatingPointColumnReader.h b/velox/dwio/common/SelectiveFloatingPointColumnReader.h index 8e354d5ee6e8..02604b3eff64 100644 --- a/velox/dwio/common/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/common/SelectiveFloatingPointColumnReader.h @@ -41,10 +41,12 @@ class SelectiveFloatingPointColumnReader : public SelectiveColumnReader { } template - void - readCommon(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls); + void readCommon( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls); - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { getFlatValues(rows, result, requestedType_); } @@ -171,7 +173,7 @@ template template void SelectiveFloatingPointColumnReader::readCommon( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); bool isDense = rows.back() == rows.size() - 1; diff --git a/velox/dwio/common/SelectiveIntegerColumnReader.h b/velox/dwio/common/SelectiveIntegerColumnReader.h index 73ac0b5ab152..7265e32a7921 100644 --- a/velox/dwio/common/SelectiveIntegerColumnReader.h +++ b/velox/dwio/common/SelectiveIntegerColumnReader.h @@ -20,8 +20,8 @@ namespace facebook::velox::dwio::common { -// Abstract class for format and encoding-independent parts of reading ingeger -// columns. +/// Abstract class for format and encoding-independent parts of reading ingeger +/// columns. class SelectiveIntegerColumnReader : public SelectiveColumnReader { public: SelectiveIntegerColumnReader( @@ -35,7 +35,7 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { params, scanSpec) {} - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { getIntValues(rows, requestedType_, result); } @@ -48,13 +48,13 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { typename ExtractValues> void processFilter( velox::common::Filter* filter, - ExtractValues extractValues, - RowSet rows); + const ExtractValues& extractValues, + const RowSet& rows); // Switches based on the type of ValueHook between different readWithVisitor // instantiations. template - void processValueHook(RowSet rows, ValueHook* hook); + void processValueHook(const RowSet& rows, ValueHook* hook); // Instantiates a Visitor based on type, isDense, value processing. template < @@ -64,14 +64,14 @@ class SelectiveIntegerColumnReader : public SelectiveColumnReader { typename ExtractValues> void readHelper( velox::common::Filter* filter, - RowSet rows, - ExtractValues extractValues); + const RowSet& rows, + const ExtractValues& extractValues); // The common part of integer reading. calls the appropriate // instantiation of processValueHook or processFilter based on // possible value hook, filter and denseness. template - void readCommon(RowSet rows); + void readCommon(const RowSet& rows); }; template < @@ -81,8 +81,8 @@ template < typename ExtractValues> void SelectiveIntegerColumnReader::readHelper( velox::common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { + const RowSet& rows, + const ExtractValues& extractValues) { switch (valueSize_) { case 2: reinterpret_cast(this)->Reader::readWithVisitor( @@ -124,8 +124,8 @@ template < typename ExtractValues> void SelectiveIntegerColumnReader::processFilter( velox::common::Filter* filter, - ExtractValues extractValues, - RowSet rows) { + const ExtractValues& extractValues, + const RowSet& rows) { if (filter == nullptr) { readHelper( &dwio::common::alwaysTrue(), rows, extractValues); @@ -193,7 +193,7 @@ void SelectiveIntegerColumnReader::processFilter( template void SelectiveIntegerColumnReader::processValueHook( - RowSet rows, + const RowSet& rows, ValueHook* hook) { switch (hook->kind()) { case aggregate::AggregationHook::kBigintSum: @@ -227,8 +227,8 @@ void SelectiveIntegerColumnReader::processValueHook( } template -void SelectiveIntegerColumnReader::readCommon(RowSet rows) { - bool isDense = rows.back() == rows.size() - 1; +void SelectiveIntegerColumnReader::readCommon(const RowSet& rows) { + const bool isDense = rows.back() == rows.size() - 1; velox::common::Filter* filter = scanSpec_->filter() ? scanSpec_->filter() : &alwaysTrue(); if (scanSpec_->keepValues()) { diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp index 4b7713433d6d..ce2a4a002eab 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.cpp @@ -89,18 +89,18 @@ advanceNestedRows(const RowSet& rows, vector_size_t i, vector_size_t last) { } // namespace void SelectiveRepeatedColumnReader::makeNestedRowSet( - RowSet rows, + const RowSet& rows, int32_t maxRow) { if (!allLengthsHolder_ || allLengthsHolder_->capacity() < (maxRow + 1) * sizeof(vector_size_t)) { - allLengthsHolder_ = allocateIndices(maxRow + 1, &memoryPool_); + allLengthsHolder_ = allocateIndices(maxRow + 1, memoryPool_); allLengths_ = allLengthsHolder_->asMutable(); } - auto nulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; + auto* nulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; // Reads the lengths, leaves an uninitialized gap for a null // map/list. Reading these checks the null mask. readLengths(allLengths_, maxRow + 1, nulls); - vector_size_t nestedLength = 0; + vector_size_t nestedLength{0}; for (auto row : rows) { if (!nulls || !bits::isBitNull(nulls, row)) { nestedLength += @@ -108,11 +108,12 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( } } nestedRowsHolder_.resize(nestedLength); + vector_size_t currentRow = 0; vector_size_t nestedRow = 0; vector_size_t nestedOffset = 0; for (auto rowIndex = 0; rowIndex < rows.size(); ++rowIndex) { - auto row = rows[rowIndex]; + const auto row = rows[rowIndex]; // Add up the lengths of non-null rows skipped since the last // non-null. nestedOffset += sumLengths(allLengths_, nulls, currentRow, row); @@ -120,7 +121,7 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( if (nulls && bits::isBitNull(nulls, row)) { continue; } - auto lengthAtRow = + const auto lengthAtRow = std::min(scanSpec_->maxArrayElementsCount(), allLengths_[row]); std::iota( nestedRowsHolder_.data() + nestedRow, @@ -135,7 +136,7 @@ void SelectiveRepeatedColumnReader::makeNestedRowSet( } void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( - RowSet rows, + const RowSet& rows, ArrayVectorBase& result) { auto* rawOffsets = result.mutableOffsets(rows.size())->asMutable(); @@ -145,7 +146,7 @@ void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( vector_size_t currentOffset = 0; vector_size_t nestedRowIndex = 0; for (int i = 0; i < rows.size(); ++i) { - auto row = rows[i]; + const auto row = rows[i]; currentOffset += sumLengths(allLengths_, nulls, currentRow, row); currentRow = row + 1; nestedRowIndex = @@ -156,7 +157,7 @@ void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( anyNulls_ = true; } else { currentOffset += allLengths_[row]; - auto newNestedRowIndex = + const auto newNestedRowIndex = advanceNestedRows(nestedRows_, nestedRowIndex, currentOffset); rawSizes[i] = newNestedRowIndex - nestedRowIndex; nestedRowIndex = newNestedRowIndex; @@ -165,7 +166,7 @@ void SelectiveRepeatedColumnReader::makeOffsetsAndSizes( numValues_ = rows.size(); } -RowSet SelectiveRepeatedColumnReader::applyFilter(RowSet rows) { +RowSet SelectiveRepeatedColumnReader::applyFilter(const RowSet& rows) { if (!scanSpec_->filter()) { return rows; } @@ -218,7 +219,7 @@ uint64_t SelectiveListColumnReader::skip(uint64_t numValues) { void SelectiveListColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { // Catch up if the child is behind the length stream. child_->seekTo(childTargetReadOffset_, false); @@ -232,9 +233,11 @@ void SelectiveListColumnReader::read( readOffset_ = offset + rows.back() + 1; } -void SelectiveListColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveListColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { VELOX_DCHECK_NOT_NULL(result); - prepareResult(*result, requestedType_, rows.size(), &memoryPool_); + prepareResult(*result, requestedType_, rows.size(), memoryPool_); auto* resultArray = result->get()->asUnchecked(); makeOffsetsAndSizes(rows, *resultArray); setComplexNulls(rows, *result); @@ -257,10 +260,10 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { numValues = formatData_->skipNulls(numValues); if (keyReader_ || elementReader_) { std::array buffer; - uint64_t childElements = 0; - uint64_t lengthsRead = 0; + uint64_t childElements{0}; + uint64_t lengthsRead{0}; while (lengthsRead < numValues) { - uint64_t chunk = + const uint64_t chunk = std::min(numValues - lengthsRead, static_cast(kBufferSize)); readLengths(buffer.data(), chunk, nullptr); for (size_t i = 0; i < chunk; ++i) { @@ -268,6 +271,7 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { } lengthsRead += chunk; } + if (keyReader_) { keyReader_->seekTo(keyReader_->readOffset() + childElements, false); } @@ -276,7 +280,6 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { elementReader_->readOffset() + childElements, false); } childTargetReadOffset_ += childElements; - } else { VELOX_FAIL("repeated reader with no children"); } @@ -285,7 +288,7 @@ uint64_t SelectiveMapColumnReader::skip(uint64_t numValues) { void SelectiveMapColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { // Catch up if child readers are behind the length stream. if (keyReader_) { @@ -296,7 +299,7 @@ void SelectiveMapColumnReader::read( } prepareRead(offset, rows, incomingNulls); - auto activeRows = applyFilter(rows); + const auto activeRows = applyFilter(rows); makeNestedRowSet(activeRows, rows.back()); if (keyReader_ && elementReader_ && !nestedRows_.empty()) { keyReader_->read(keyReader_->readOffset(), nestedRows_, nullptr); @@ -309,13 +312,15 @@ void SelectiveMapColumnReader::read( readOffset_ = offset + rows.back() + 1; } -void SelectiveMapColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveMapColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { VELOX_DCHECK_NOT_NULL(result); VELOX_CHECK( !result->get() || result->get()->type()->isMap(), "Expect MAP result vector, got {}", result->get()->type()->toString()); - prepareResult(*result, requestedType_, rows.size(), &memoryPool_); + prepareResult(*result, requestedType_, rows.size(), memoryPool_); auto* resultMap = result->get()->asUnchecked(); makeOffsetsAndSizes(rows, *resultMap); setComplexNulls(rows, *result); diff --git a/velox/dwio/common/SelectiveRepeatedColumnReader.h b/velox/dwio/common/SelectiveRepeatedColumnReader.h index 1d486acb4718..c5a60d8b5e6c 100644 --- a/velox/dwio/common/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/common/SelectiveRepeatedColumnReader.h @@ -51,23 +51,23 @@ class SelectiveRepeatedColumnReader : public SelectiveColumnReader { virtual void readLengths(int32_t* lengths, int32_t numLengths, const uint64_t* nulls) = 0; - // Create row set for child columns based on the row set of parent column. - void makeNestedRowSet(RowSet rows, int32_t maxRow); + /// Create row set for child columns based on the row set of parent column. + void makeNestedRowSet(const RowSet& rows, int32_t maxRow); - // Compute the offsets and lengths based on the current filtered rows passed - // in. - void makeOffsetsAndSizes(RowSet rows, ArrayVectorBase&); + /// Compute the offsets and lengths based on the current filtered rows passed + /// in. + void makeOffsetsAndSizes(const RowSet& rows, ArrayVectorBase&); - // Creates a struct if '*result' is empty and 'type' is a row. + /// Creates a struct if '*result' is empty and 'type' is a row. void prepareStructResult(const TypePtr& type, VectorPtr* result) { if (!*result && type->kind() == TypeKind::ROW) { - *result = BaseVector::create(type, 0, &memoryPool_); + *result = BaseVector::create(type, 0, memoryPool_); } } // Apply filter on parent level. Child filtering should be handled separately // in subclasses. - RowSet applyFilter(RowSet rows); + RowSet applyFilter(const RowSet& rows); BufferPtr allLengthsHolder_; vector_size_t* allLengths_; @@ -96,10 +96,12 @@ class SelectiveListColumnReader : public SelectiveRepeatedColumnReader { uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; protected: std::unique_ptr child_; @@ -120,11 +122,14 @@ class SelectiveMapColumnReader : public SelectiveRepeatedColumnReader { uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; + protected: std::unique_ptr keyReader_; std::unique_ptr elementReader_; }; diff --git a/velox/dwio/common/SelectiveStructColumnReader.cpp b/velox/dwio/common/SelectiveStructColumnReader.cpp index 7df3242bae98..cdd867065566 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.cpp +++ b/velox/dwio/common/SelectiveStructColumnReader.cpp @@ -57,16 +57,17 @@ void SelectiveStructColumnReaderBase::fillOutputRowsFromMutation( vector_size_t size) { if (mutation_->deletedRows) { bits::forEachUnsetBit(mutation_->deletedRows, 0, size, [&](auto i) { - if (!mutation_->randomSkip || mutation_->randomSkip->testOne()) { + if ((mutation_->randomSkip == nullptr) || + mutation_->randomSkip->testOne()) { addOutputRow(i); } }); } else { - VELOX_CHECK(mutation_->randomSkip); + VELOX_CHECK_NOT_NULL(mutation_->randomSkip); vector_size_t i = 0; while (i < size) { - auto skip = mutation_->randomSkip->nextSkip(); - auto remaining = size - i; + const auto skip = mutation_->randomSkip->nextSkip(); + const auto remaining = size - i; if (skip >= remaining) { mutation_->randomSkip->consume(remaining); break; @@ -113,14 +114,14 @@ void SelectiveStructColumnReaderBase::next( } } } - for (auto& childSpec : scanSpec_->children()) { + for (const auto& childSpec : scanSpec_->children()) { if (isChildConstant(*childSpec) && !testFilterOnConstant(*childSpec)) { numValues = 0; break; } } - // no readers + // No readers // This can be either count(*) query or a query that select only // constant columns (partition keys or columns missing from an old file // due to schema evolution) @@ -130,14 +131,15 @@ void SelectiveStructColumnReaderBase::next( for (auto& childSpec : scanSpec_->children()) { VELOX_CHECK(childSpec->isConstant()); if (childSpec->projectOut()) { - auto channel = childSpec->channel(); + const auto channel = childSpec->channel(); resultRowVector->childAt(channel) = BaseVector::wrapInConstant( numValues, 0, childSpec->constantValue()); } } return; } - auto oldSize = rows_.size(); + + const auto oldSize = rows_.size(); rows_.resize(numValues); if (numValues > oldSize) { std::iota(&rows_[oldSize], &rows_[rows_.size()], oldSize); @@ -148,7 +150,7 @@ void SelectiveStructColumnReaderBase::next( void SelectiveStructColumnReaderBase::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { numReads_ = scanSpec_->newRead(); prepareRead(offset, rows, incomingNulls); @@ -156,7 +158,7 @@ void SelectiveStructColumnReaderBase::read( if (hasDeletion_) { // We handle the mutation after prepareRead so that output rows and format // specific initializations (e.g. RepDef in Parquet) are done properly. - VELOX_DCHECK(!nullsInReadRange_, "Only top level can have mutation"); + VELOX_DCHECK_NULL(nullsInReadRange_, "Only top level can have mutation"); VELOX_DCHECK_EQ( rows.back(), rows.size() - 1, "Top level should have a dense row set"); fillOutputRowsFromMutation(rows.size()); @@ -166,11 +168,12 @@ void SelectiveStructColumnReaderBase::read( } activeRows = outputRows_; } + const uint64_t* structNulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; - // a struct reader may have a null/non-null filter + // A struct reader may have a null/non-null filter if (scanSpec_->filter()) { - auto kind = scanSpec_->filter()->kind(); + const auto kind = scanSpec_->filter()->kind(); VELOX_CHECK( kind == velox::common::FilterKind::kIsNull || kind == velox::common::FilterKind::kIsNotNull); @@ -185,10 +188,10 @@ void SelectiveStructColumnReaderBase::read( activeRows = outputRows_; } - auto& childSpecs = scanSpec_->children(); + const auto& childSpecs = scanSpec_->children(); VELOX_CHECK(!childSpecs.empty()); for (size_t i = 0; i < childSpecs.size(); ++i) { - auto& childSpec = childSpecs[i]; + const auto& childSpec = childSpecs[i]; VELOX_TRACE_HISTORY_PUSH("read %s", childSpec->fieldName().c_str()); if (isChildConstant(*childSpec)) { if (!testFilterOnConstant(*childSpec)) { @@ -197,13 +200,15 @@ void SelectiveStructColumnReaderBase::read( } continue; } - auto fieldIndex = childSpec->subscript(); - auto reader = children_.at(fieldIndex); + + const auto fieldIndex = childSpec->subscript(); + auto* reader = children_.at(fieldIndex); if (reader->isTopLevel() && childSpec->projectOut() && !childSpec->hasFilter() && !childSpec->extractValues()) { // Will make a LazyVector. continue; } + advanceFieldReader(reader, offset); if (childSpec->hasFilter()) { { @@ -239,18 +244,18 @@ void SelectiveStructColumnReaderBase::read( void SelectiveStructColumnReaderBase::recordParentNullsInChildren( vector_size_t offset, - RowSet rows) { + const RowSet& rows) { if (formatData_->parentNullsInLeaves()) { return; } - auto& childSpecs = scanSpec_->children(); + const auto& childSpecs = scanSpec_->children(); for (auto i = 0; i < childSpecs.size(); ++i) { - auto& childSpec = childSpecs[i]; + const auto& childSpec = childSpecs[i]; if (isChildConstant(*childSpec)) { continue; } - auto fieldIndex = childSpec->subscript(); - auto reader = children_.at(fieldIndex); + const auto fieldIndex = childSpec->subscript(); + auto* reader = children_.at(fieldIndex); reader->addParentNulls( offset, nullsInReadRange_ ? nullsInReadRange_->as() : nullptr, @@ -358,7 +363,7 @@ void setNullField( } // namespace void SelectiveStructColumnReaderBase::getValues( - RowSet rows, + const RowSet& rows, VectorPtr* result) { VELOX_CHECK(!scanSpec_->children().empty()); VELOX_CHECK_NOT_NULL( @@ -381,37 +386,43 @@ void SelectiveStructColumnReaderBase::getValues( 0, std::move(children)); } + auto* resultRow = static_cast(result->get()); resultRow->unsafeResize(rows.size()); - if (!rows.size()) { + if (rows.empty()) { return; } + setComplexNulls(rows, *result); bool lazyPrepared = false; - for (auto& childSpec : scanSpec_->children()) { + for (const auto& childSpec : scanSpec_->children()) { VELOX_TRACE_HISTORY_PUSH("getValues %s", childSpec->fieldName().c_str()); if (!childSpec->projectOut()) { continue; } - auto channel = childSpec->channel(); + + const auto channel = childSpec->channel(); auto& childResult = resultRow->childAt(channel); if (childSpec->isConstant()) { setConstantField(childSpec->constantValue(), rows.size(), childResult); continue; } - auto index = childSpec->subscript(); + + const auto index = childSpec->subscript(); // Set missing fields to be null constant, if we're in the top level struct // missing columns should already be a null constant from the check above. if (index == kConstantChildSpecSubscript) { - auto& childType = rowType.childAt(channel); + const auto& childType = rowType.childAt(channel); setNullField(rows.size(), childResult, childType, resultRow->pool()); continue; } + if (childSpec->extractValues() || childSpec->hasFilter() || !children_[index]->isTopLevel()) { children_[index]->getValues(rows, &childResult); continue; } + // LazyVector result. if (!lazyPrepared) { if (rows.size() != outputRows_.size()) { @@ -419,17 +430,17 @@ void SelectiveStructColumnReaderBase::getValues( } lazyPrepared = true; } - auto loader = + auto lazyLoader = std::make_unique(this, children_[index], numReads_); if (childResult && childResult->isLazy() && childResult.unique()) { static_cast(*childResult) - .reset(std::move(loader), rows.size()); + .reset(std::move(lazyLoader), rows.size()); } else { childResult = std::make_shared( - &memoryPool_, + memoryPool_, resultRow->type()->childAt(channel), rows.size(), - std::move(loader), + std::move(lazyLoader), std::move(childResult)); } } diff --git a/velox/dwio/common/SelectiveStructColumnReader.h b/velox/dwio/common/SelectiveStructColumnReader.h index be7efd1045f3..6f15b321391c 100644 --- a/velox/dwio/common/SelectiveStructColumnReader.h +++ b/velox/dwio/common/SelectiveStructColumnReader.h @@ -40,10 +40,12 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { const dwio::common::StatsContext& context, FormatData::FilterRowGroupsResult&) const override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; uint64_t numReads() const { return numReads_; @@ -121,11 +123,11 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { getExceptionContext().message(VeloxException::Type::kSystem)), isRoot_(isRoot) {} - // Records the number of nulls added by 'this' between the end - // position of each child reader and the end of the range of - // 'read(). This must be done also if a child is not read so that we - // know how much to skip when seeking forward within the row group. - void recordParentNullsInChildren(vector_size_t offset, RowSet rows); + /// Records the number of nulls added by 'this' between the end position of + /// each child reader and the end of the range of 'read(). This must be done + /// also if a child is not read so that we know how much to skip when seeking + /// forward within the row group. + void recordParentNullsInChildren(vector_size_t offset, const RowSet& rows); bool hasDeletion() const final { return hasDeletion_; @@ -137,6 +139,17 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { void fillOutputRowsFromMutation(vector_size_t size); + // Context information obtained from ExceptionContext. Stored here + // so that LazyVector readers under this can add this to their + // ExceptionContext. Allows contextualizing reader errors to split + // and query. Set at construction, which takes place on first + // use. If no ExceptionContext is in effect, this is "". + const std::string debugString_; + + // Whether or not this is the root Struct that represents entire rows of the + // table. + const bool isRoot_; + std::vector children_; // Sequence number of output batch. Checked against ColumnLoaders @@ -155,20 +168,10 @@ class SelectiveStructColumnReaderBase : public SelectiveColumnReader { bool hasDeletion_ = false; bool fillMutatedOutputRows_ = false; - - // Context information obtained from ExceptionContext. Stored here - // so that LazyVector readers under this can add this to their - // ExceptionContext. Allows contextualizing reader errors to split - // and query. Set at construction, which takes place on first - // use. If no ExceptionContext is in effect, this is "". - const std::string debugString_; - - // Whether or not this is the root Struct that represents entire rows of the - // table. - const bool isRoot_; }; -struct SelectiveStructColumnReader : SelectiveStructColumnReaderBase { +class SelectiveStructColumnReader : public SelectiveStructColumnReaderBase { + public: using SelectiveStructColumnReaderBase::SelectiveStructColumnReaderBase; void addChild(std::unique_ptr child) { @@ -208,7 +211,7 @@ class SelectiveFlatMapColumnReaderHelper { reader_.children_[i]->setIsFlatMapValue(true); } if (auto type = reader_.requestedType_->childAt(1); type->isRow()) { - childValues_ = BaseVector::create(type, 0, &reader_.memoryPool_); + childValues_ = BaseVector::create(type, 0, reader_.memoryPool_); } } @@ -224,8 +227,8 @@ class SelectiveFlatMapColumnReaderHelper { result->resize(size); } else { VLOG(1) << "Reallocating result MAP vector of size " << size; - result = BaseVector::create( - reader_.requestedType_, size, &reader_.memoryPool_); + result = + BaseVector::create(reader_.requestedType_, size, reader_.memoryPool_); } return *result->asUnchecked(); } @@ -459,8 +462,7 @@ void SelectiveFlatMapColumnReaderHelper::copyValues( } } if (strKeySize > 0) { - auto buf = - AlignedBuffer::allocate(strKeySize, &reader_.memoryPool_); + auto buf = AlignedBuffer::allocate(strKeySize, reader_.memoryPool_); rawStrKeyBuffer = buf->template asMutable(); flatKeys->addStringBuffer(buf); strKeySize = 0; diff --git a/velox/dwio/common/TypeUtils.cpp b/velox/dwio/common/TypeUtils.cpp index 50d8b4116383..29e22046196f 100644 --- a/velox/dwio/common/TypeUtils.cpp +++ b/velox/dwio/common/TypeUtils.cpp @@ -139,7 +139,7 @@ void checkTypeCompatibility( } if (recurse) { - uint64_t childCount = std::min(from.size(), to.size()); + const uint64_t childCount = std::min(from.size(), to.size()); for (uint64_t i = 0; i < childCount; ++i) { checkTypeCompatibility( *from.childAt(i), diff --git a/velox/dwio/common/UnitLoader.h b/velox/dwio/common/UnitLoader.h index c1594b5a6333..d3125dacc4be 100644 --- a/velox/dwio/common/UnitLoader.h +++ b/velox/dwio/common/UnitLoader.h @@ -26,16 +26,16 @@ class LoadUnit { public: virtual ~LoadUnit() = default; - // Perform the IO (read) + /// Perform the IO (read) virtual void load() = 0; - // Unload the unit to free memory + /// Unload the unit to free memory virtual void unload() = 0; - // Number of rows in the unit + /// Number of rows in the unit virtual uint64_t getNumRows() = 0; - // Number of bytes that the IO will read + /// Number of bytes that the IO will read virtual uint64_t getIoSize() = 0; }; @@ -43,18 +43,18 @@ class UnitLoader { public: virtual ~UnitLoader() = default; - // Must block until the unit is loaded. - // This call could unload other units. So the returned LoadUnit& is only - // guaranteed to remain loaded until the next call + /// Must block until the unit is loaded. This call could unload other units. + /// So the returned LoadUnit& is only guaranteed to remain loaded until the + /// next call. virtual LoadUnit& getLoadedUnit(uint32_t unit) = 0; - // Reader reports progress calling this method - // The call must be done **after** getLoadedUnit for unit + /// Reader reports progress calling this method. The call must be done + /// **after** getLoadedUnit for unit. virtual void onRead(uint32_t unit, uint64_t rowOffsetInUnit, uint64_t rowCount) = 0; - // Reader reports seek calling this method. - // The call must be done **before** getLoadedUnit for the new unit + /// Reader reports seek calling this method. The call must be done **before** + /// getLoadedUnit for the new unit. virtual void onSeek(uint32_t unit, uint64_t rowOffsetInUnit) = 0; }; diff --git a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp index 2a07b4574646..19072dd2a55e 100644 --- a/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp +++ b/velox/dwio/common/tests/BitPackDecoderBenchmark.cpp @@ -257,7 +257,7 @@ void naiveDecodeBitsLE( template void legacyUnpackNaive(RowSet rows, uint8_t bitWidth, T* result) { auto data = bitPackedData[bitWidth].data(); - auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8; + auto numBytes = bits::divRoundUp((rows.back() + 1) * bitWidth, 8); auto end = reinterpret_cast(data) + numBytes; naiveDecodeBitsLE(data, 0, rows, 0, bitWidth, end, result32.data()); } @@ -265,7 +265,7 @@ void legacyUnpackNaive(RowSet rows, uint8_t bitWidth, T* result) { template void legacyUnpackFast(RowSet rows, uint8_t bitWidth, T* result) { auto data = bitPackedData[bitWidth].data(); - auto numBytes = bits::roundUp((rows.back() + 1) * bitWidth, 8) / 8; + auto numBytes = bits::divRoundUp((rows.back() + 1) * bitWidth, 8); auto end = reinterpret_cast(data) + numBytes; facebook::velox::dwio::common::unpack( data, @@ -503,7 +503,7 @@ BENCHMARK_UNPACK_ODDROWS_CASE_32(31) void populateBitPacked() { bitPackedData.resize(33); for (auto bitWidth = 1; bitWidth <= 32; ++bitWidth) { - auto numWords = bits::roundUp(randomInts_u32.size() * bitWidth, 64) / 64; + auto numWords = bits::divRoundUp(randomInts_u32.size() * bitWidth, 64); bitPackedData[bitWidth].resize(numWords); auto source = reinterpret_cast(randomInts_u32.data()); auto destination = @@ -549,8 +549,7 @@ void naiveDecodeBitsLE( bool anyUnsafe = false; if (bufferEnd) { const char* endByte = reinterpret_cast(bits) + - bits::roundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8) / - 8; + bits::divRoundUp(bitOffset + (rows.back() - rowBias + 1) * bitWidth, 8); // redzone is the number of bytes at the end of the accessed range that // could overflow the buffer if accessed 64 its wide. int64_t redZone = diff --git a/velox/dwio/common/tests/BitPackDecoderTest.cpp b/velox/dwio/common/tests/BitPackDecoderTest.cpp index 945a11caba9f..271fe8533784 100644 --- a/velox/dwio/common/tests/BitPackDecoderTest.cpp +++ b/velox/dwio/common/tests/BitPackDecoderTest.cpp @@ -45,7 +45,7 @@ class BitPackDecoderTest : public testing::Test { void populateBitPackedData() { bitPackedData_.resize(33); for (auto bitWidth = 1; bitWidth <= 32; ++bitWidth) { - auto numWords = bits::roundUp(randomInts_.size() * bitWidth, 64) / 64; + auto numWords = bits::divRoundUp(randomInts_.size() * bitWidth, 64); bitPackedData_[bitWidth].resize(numWords); auto source = randomInts_.data(); auto destination = diff --git a/velox/dwio/common/tests/OptionsTests.cpp b/velox/dwio/common/tests/OptionsTests.cpp index 7de98abb8843..82335821c521 100644 --- a/velox/dwio/common/tests/OptionsTests.cpp +++ b/velox/dwio/common/tests/OptionsTests.cpp @@ -22,7 +22,7 @@ using namespace facebook::velox::dwio::common; TEST(OptionsTests, defaultRowNumberColumnInfoTest) { // appendRowNumberColumn flag should be false by default RowReaderOptions rowReaderOptions; - ASSERT_EQ(std::nullopt, rowReaderOptions.getRowNumberColumnInfo()); + ASSERT_EQ(std::nullopt, rowReaderOptions.rowNumberColumnInfo()); } TEST(OptionsTests, setRowNumberColumnInfoTest) { @@ -31,7 +31,7 @@ TEST(OptionsTests, setRowNumberColumnInfoTest) { rowNumberColumnInfo.insertPosition = 0; rowNumberColumnInfo.name = "test"; rowReaderOptions.setRowNumberColumnInfo(rowNumberColumnInfo); - auto rowNumberColumn = rowReaderOptions.getRowNumberColumnInfo().value(); + auto rowNumberColumn = rowReaderOptions.rowNumberColumnInfo().value(); ASSERT_EQ(rowNumberColumnInfo.insertPosition, rowNumberColumn.insertPosition); ASSERT_EQ(rowNumberColumnInfo.name, rowNumberColumn.name); } @@ -39,7 +39,7 @@ TEST(OptionsTests, setRowNumberColumnInfoTest) { TEST(OptionsTests, testRowNumberColumnInfoInCopy) { RowReaderOptions rowReaderOptions; RowReaderOptions rowReaderOptionsCopy{rowReaderOptions}; - ASSERT_EQ(std::nullopt, rowReaderOptionsCopy.getRowNumberColumnInfo()); + ASSERT_EQ(std::nullopt, rowReaderOptionsCopy.rowNumberColumnInfo()); RowNumberColumnInfo rowNumberColumnInfo; rowNumberColumnInfo.insertPosition = 0; @@ -47,7 +47,7 @@ TEST(OptionsTests, testRowNumberColumnInfoInCopy) { rowReaderOptions.setRowNumberColumnInfo(rowNumberColumnInfo); RowReaderOptions rowReaderOptionsSecondCopy{rowReaderOptions}; auto rowNumberColumn = - rowReaderOptionsSecondCopy.getRowNumberColumnInfo().value(); + rowReaderOptionsSecondCopy.rowNumberColumnInfo().value(); ASSERT_EQ(rowNumberColumnInfo.insertPosition, rowNumberColumn.insertPosition); ASSERT_EQ(rowNumberColumnInfo.name, rowNumberColumn.name); } diff --git a/velox/dwio/dwrf/common/DecoderUtil.h b/velox/dwio/dwrf/common/DecoderUtil.h index 9c269a62f710..f1b100773d68 100644 --- a/velox/dwio/dwrf/common/DecoderUtil.h +++ b/velox/dwio/dwrf/common/DecoderUtil.h @@ -16,6 +16,7 @@ #pragma once +#include "velox/common/base/Exceptions.h" #include "velox/dwio/common/DirectDecoder.h" #include "velox/dwio/common/IntDecoder.h" #include "velox/dwio/dwrf/common/RLEv1.h" @@ -42,7 +43,7 @@ std::unique_ptr> createRleDecoder( case RleVersion_2: return std::make_unique>(std::move(input), pool); default: - DWIO_ENSURE(false, "not supported"); + VELOX_UNSUPPORTED("Not supported: {}", static_cast(version)); return {}; } } diff --git a/velox/dwio/dwrf/common/FileMetadata.h b/velox/dwio/dwrf/common/FileMetadata.h index 34e531a6b29d..3a9ab2767415 100644 --- a/velox/dwio/dwrf/common/FileMetadata.h +++ b/velox/dwio/dwrf/common/FileMetadata.h @@ -31,13 +31,6 @@ enum class DwrfFormat : uint8_t { }; class ProtoWrapperBase { - protected: - ProtoWrapperBase(DwrfFormat format, const void* impl) - : format_{format}, impl_{impl} {} - - DwrfFormat format_; - const void* impl_; - public: DwrfFormat format() const { return format_; @@ -46,6 +39,13 @@ class ProtoWrapperBase { inline const void* rawProtoPtr() const { return impl_; } + + protected: + ProtoWrapperBase(DwrfFormat format, const void* impl) + : format_{format}, impl_{impl} {} + + const DwrfFormat format_; + const void* const impl_; }; /*** diff --git a/velox/dwio/dwrf/common/Statistics.cpp b/velox/dwio/dwrf/common/Statistics.cpp index 8252a289a87d..57d8cea80c6f 100644 --- a/velox/dwio/dwrf/common/Statistics.cpp +++ b/velox/dwio/dwrf/common/Statistics.cpp @@ -42,21 +42,23 @@ std::unique_ptr buildColumnStatisticsFromProto( : std::nullopt, intStats.hasSum() ? std::optional(intStats.sum()) : std::nullopt); } else if (stats.hasDoubleStatistics()) { - const auto& dStats = stats.doubleStatistics(); + const auto& doubleStats = stats.doubleStatistics(); // Comparing against NaN doesn't make sense, and to prevent downstream // from incorrectly using it, need to make sure min/max/sum doens't have // NaN. - auto hasNan = (dStats.hasMinimum() && std::isnan(dStats.minimum())) || - (dStats.hasMaximum() && std::isnan(dStats.maximum())) || - (dStats.hasSum() && std::isnan(dStats.sum())); + const auto hasNan = + (doubleStats.hasMinimum() && std::isnan(doubleStats.minimum())) || + (doubleStats.hasMaximum() && std::isnan(doubleStats.maximum())) || + (doubleStats.hasSum() && std::isnan(doubleStats.sum())); if (!hasNan) { return std::make_unique( colStats, - dStats.hasMinimum() ? std::optional(dStats.minimum()) - : std::nullopt, - dStats.hasMaximum() ? std::optional(dStats.maximum()) - : std::nullopt, - dStats.hasSum() ? std::optional(dStats.sum()) : std::nullopt); + doubleStats.hasMinimum() ? std::optional(doubleStats.minimum()) + : std::nullopt, + doubleStats.hasMaximum() ? std::optional(doubleStats.maximum()) + : std::nullopt, + doubleStats.hasSum() ? std::optional(doubleStats.sum()) + : std::nullopt); } } else if (stats.hasStringStatistics()) { // DWRF_5_0 is the first version that string stats are saved as UTF8 diff --git a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp index 1134091af62c..9d8f5a5bf05f 100644 --- a/velox/dwio/dwrf/reader/BinaryStreamReader.cpp +++ b/velox/dwio/dwrf/reader/BinaryStreamReader.cpp @@ -85,16 +85,16 @@ std::vector BinaryStripeStreams::getStreamIdentifiers( BinaryStreamReader::BinaryStreamReader( const std::shared_ptr& reader, const std::vector& columnIds) - : stripeReaderBase_{reader}, - columnSelector_{reader->getSchema(), columnIds}, - stripeIndex_{0}, - numStripes{folly::to(reader->getFooter().stripesSize())} { - DWIO_ENSURE(!reader->getFooter().hasEncryption(), "encryption not supported"); - DWIO_ENSURE(!columnIds.empty(), "Atleast one column expected to be read"); + : columnSelector_{reader->schema(), columnIds}, + numStripes_{folly::to(reader->footer().stripesSize())}, + stripeReaderBase_{reader}, + stripeIndex_{0} { + VELOX_CHECK(!reader->footer().hasEncryption(), "encryption not supported"); + VELOX_CHECK(!columnIds.empty(), "Atleast one column expected to be read"); } std::unique_ptr BinaryStreamReader::next() { - if (stripeIndex_ >= numStripes) { + if (stripeIndex_ >= numStripes_) { return nullptr; } return std::make_unique( @@ -105,12 +105,12 @@ std::unordered_map BinaryStreamReader::getStatistics() const { std::unordered_map stats; auto footerStatsSize = - stripeReaderBase_.getReader().getFooter().statisticsSize(); - auto typesSize = stripeReaderBase_.getReader().getFooter().typesSize(); + stripeReaderBase_.getReader().footer().statisticsSize(); + auto typesSize = stripeReaderBase_.getReader().footer().typesSize(); if (footerStatsSize == 0) { - DWIO_ENSURE_EQ( - numStripes, + VELOX_CHECK_EQ( + numStripes_, 0, "Corrupted file detected, Footer stats are missing, but stripes are present"); for (auto node = 0; node < typesSize; node++) { @@ -119,14 +119,14 @@ BinaryStreamReader::getStatistics() const { } } } else { - DWIO_ENSURE_EQ( + VELOX_CHECK_EQ( footerStatsSize, typesSize, "different number of nodes and statistics"); // Node 0 is always selected by ColumnSelector, though this can be // disabled for the current use cases. for (auto node = 0; node < footerStatsSize; node++) { if (columnSelector_.shouldReadNode(node)) { stats[node] = - stripeReaderBase_.getReader().getFooter().dwrfStatistics(node); + stripeReaderBase_.getReader().footer().dwrfStatistics(node); } } } @@ -134,7 +134,7 @@ BinaryStreamReader::getStatistics() const { } uint32_t BinaryStreamReader::getStrideLen() const { - return stripeReaderBase_.getReader().getFooter().rowIndexStride(); + return stripeReaderBase_.getReader().footer().rowIndexStride(); } } // namespace facebook::velox::dwrf::detail diff --git a/velox/dwio/dwrf/reader/BinaryStreamReader.h b/velox/dwio/dwrf/reader/BinaryStreamReader.h index 5da500efc8e3..d418bfa48c1f 100644 --- a/velox/dwio/dwrf/reader/BinaryStreamReader.h +++ b/velox/dwio/dwrf/reader/BinaryStreamReader.h @@ -67,7 +67,7 @@ class BinaryStripeStreams { class BinaryStreamReader { public: - explicit BinaryStreamReader( + BinaryStreamReader( const std::shared_ptr& reader, const std::vector& columnIds); @@ -81,13 +81,15 @@ class BinaryStreamReader { return stripeIndex_; } + uint32_t numStripes() const { + return numStripes_; + } + private: - StripeReaderBase stripeReaderBase_; const dwio::common::ColumnSelector columnSelector_; + const uint32_t numStripes_; + StripeReaderBase stripeReaderBase_; uint32_t stripeIndex_; - - public: - const uint32_t numStripes; }; } // namespace facebook::velox::dwrf::detail diff --git a/velox/dwio/dwrf/reader/ColumnReader.cpp b/velox/dwio/dwrf/reader/ColumnReader.cpp index f1df174dd4c2..9a11e9ebb567 100644 --- a/velox/dwio/dwrf/reader/ColumnReader.cpp +++ b/velox/dwio/dwrf/reader/ColumnReader.cpp @@ -1147,7 +1147,7 @@ StringDictionaryColumnReader::StringDictionaryColumnReader( encodingKey.forKind(proto::Stream_Kind_DICTIONARY_DATA), streamLabels.label(), false)), - returnFlatVector_(stripe.getRowReaderOptions().getReturnFlatVector()) { + returnFlatVector_(stripe.rowReaderOptions().returnFlatVector()) { MakeRleDecoderParams params{ .encodingKey = encodingKey, .stripe = stripe, @@ -1814,7 +1814,7 @@ StructColumnReader::StructColumnReader( // count the number of selected sub-columns const auto& cs = stripe.getColumnSelector(); - auto project = stripe.getRowReaderOptions().getProjectSelectedType(); + auto project = stripe.rowReaderOptions().projectSelectedType(); for (uint64_t i = 0; i < requestedType_->size(); ++i) { auto& child = requestedType_->childAt(i); diff --git a/velox/dwio/dwrf/reader/DwrfData.cpp b/velox/dwio/dwrf/reader/DwrfData.cpp index ec2856d42520..26116449e10a 100644 --- a/velox/dwio/dwrf/reader/DwrfData.cpp +++ b/velox/dwio/dwrf/reader/DwrfData.cpp @@ -53,23 +53,25 @@ uint64_t DwrfData::skipNulls(uint64_t numValues, bool /*nullsOnly*/) { if (!notNullDecoder_ && !flatMapContext_.inMapDecoder) { return numValues; } + // Page through the values that we want to skip and count how many are // non-null. constexpr uint64_t kBufferBytes = 1024; constexpr auto kBitCount = kBufferBytes * 8; - auto countNulls = [](ByteRleDecoder& decoder, size_t size) { + const auto countNulls = [](ByteRleDecoder& decoder, size_t size) { char buffer[kBufferBytes]; decoder.next(buffer, size, nullptr); return bits::countNulls(reinterpret_cast(buffer), 0, size); }; + uint64_t remaining = numValues; while (remaining > 0) { - uint64_t chunkSize = std::min(remaining, kBitCount); - uint64_t nullCount; - if (flatMapContext_.inMapDecoder) { + const uint64_t chunkSize = std::min(remaining, kBitCount); + uint64_t nullCount{0}; + if (flatMapContext_.inMapDecoder != nullptr) { nullCount = countNulls(*flatMapContext_.inMapDecoder, chunkSize); if (notNullDecoder_) { - if (auto inMapSize = chunkSize - nullCount; inMapSize > 0) { + if (const auto inMapSize = chunkSize - nullCount; inMapSize > 0) { nullCount += countNulls(*notNullDecoder_, inMapSize); } } @@ -91,8 +93,9 @@ void DwrfData::ensureRowGroupIndex() { dwio::common::PositionProvider DwrfData::seekToRowGroup(uint32_t index) { ensureRowGroupIndex(); - tempPositions_ = toPositionsInner(index_->entry(index)); - dwio::common::PositionProvider positionProvider(tempPositions_); + + positionsHolder_ = toPositionsInner(index_->entry(index)); + dwio::common::PositionProvider positionProvider(positionsHolder_); if (flatMapContext_.inMapDecoder) { flatMapContext_.inMapDecoder->seekToRowGroup(positionProvider); } @@ -113,11 +116,13 @@ void DwrfData::readNulls( nulls = nullptr; return; } - auto numBytes = bits::nbytes(numValues); + + const auto numBytes = bits::nbytes(numValues); if (!nulls || nulls->capacity() < numBytes) { nulls = AlignedBuffer::allocate(numBytes, &memoryPool_); } nulls->setSize(numBytes); + auto* nullsPtr = nulls->asMutable(); if (flatMapContext_.inMapDecoder) { if (notNullDecoder_) { @@ -132,7 +137,7 @@ void DwrfData::readNulls( } else if (notNullDecoder_) { notNullDecoder_->next(nullsPtr, numValues, incomingNulls); } else { - memcpy(nullsPtr, incomingNulls, numBytes); + ::memcpy(nullsPtr, incomingNulls, numBytes); } } @@ -144,22 +149,25 @@ void DwrfData::filterRowGroups( if (!index_ && !indexStream_) { return; } + ensureRowGroupIndex(); - auto filter = scanSpec.filter(); - auto dwrfContext = reinterpret_cast(&writerContext); + auto* filter = scanSpec.filter(); + auto* dwrfContext = reinterpret_cast(&writerContext); result.totalCount = std::max(result.totalCount, index_->entry_size()); - auto nwords = bits::nwords(result.totalCount); + const auto nwords = bits::nwords(result.totalCount); if (result.filterResult.size() < nwords) { result.filterResult.resize(nwords); } - auto metadataFiltersStartIndex = result.metadataFilterResults.size(); + + const auto metadataFiltersStartIndex = result.metadataFilterResults.size(); for (int i = 0; i < scanSpec.numMetadataFilters(); ++i) { result.metadataFilterResults.emplace_back( scanSpec.metadataFilterNodeAt(i), std::vector(nwords)); } - for (auto i = 0; i < index_->entry_size(); i++) { + + for (auto i = 0; i < index_->entry_size(); ++i) { const auto& entry = index_->entry(i); - auto columnStats = buildColumnStatisticsFromProto( + const auto columnStats = buildColumnStatisticsFromProto( ColumnStatisticsWrapper(&entry.statistics()), *dwrfContext); if (filter && !testFilter( @@ -168,6 +176,7 @@ void DwrfData::filterRowGroups( bits::setBit(result.filterResult.data(), i); continue; } + for (int j = 0; j < scanSpec.numMetadataFilters(); ++j) { auto* metadataFilter = scanSpec.metadataFilterAt(j); if (!testFilter( diff --git a/velox/dwio/dwrf/reader/DwrfData.h b/velox/dwio/dwrf/reader/DwrfData.h index 5e847c9132f0..995cc80ac410 100644 --- a/velox/dwio/dwrf/reader/DwrfData.h +++ b/velox/dwio/dwrf/reader/DwrfData.h @@ -54,7 +54,7 @@ class DwrfData : public dwio::common::FormatData { const common::ScanSpec& scanSpec, uint64_t rowsPerRowGroup, const dwio::common::StatsContext& writerContext, - FilterRowGroupsResult&) override; + FilterRowGroupsResult& result) override; bool hasNulls() const override { return notNullDecoder_ != nullptr; @@ -72,8 +72,8 @@ class DwrfData : public dwio::common::FormatData { return flatMapContext_.inMapDecoder ? inMap_->as() : nullptr; } - // seeks possible flat map in map streams and nulls to the row group - // and returns a PositionsProvider for the other streams. + /// Seeks possible flat map in map streams and nulls to the row group + /// and returns a PositionsProvider for the other streams. dwio::common::PositionProvider seekToRowGroup(uint32_t index) override; int64_t stripeRows() const { @@ -102,7 +102,7 @@ class DwrfData : public dwio::common::FormatData { memory::MemoryPool& memoryPool_; const std::shared_ptr fileType_; FlatMapContext flatMapContext_; - std::unique_ptr notNullDecoder_; + std::unique_ptr notNullDecoder_; std::unique_ptr indexStream_; std::unique_ptr index_; int64_t stripeRows_; @@ -111,13 +111,13 @@ class DwrfData : public dwio::common::FormatData { // Storage for positions backing a PositionProvider returned from // seekToRowGroup(). - std::vector tempPositions_; + std::vector positionsHolder_; // In map bitmap used in flat map encoding. BufferPtr inMap_; }; -// DWRF specific initialization. +/// DWRF specific initialization. class DwrfParams : public dwio::common::FormatParams { public: explicit DwrfParams( @@ -126,9 +126,9 @@ class DwrfParams : public dwio::common::FormatParams { dwio::common::ColumnReaderStatistics& stats, FlatMapContext context = {}) : FormatParams(stripeStreams.getMemoryPool(), stats), + streamLabels_(streamLabels), stripeStreams_(stripeStreams), - flatMapContext_(context), - streamLabels_(streamLabels) {} + flatMapContext_(context) {} std::unique_ptr toFormatData( const std::shared_ptr& type, @@ -150,9 +150,9 @@ class DwrfParams : public dwio::common::FormatParams { } private: + const StreamLabels& streamLabels_; StripeStreams& stripeStreams_; FlatMapContext flatMapContext_; - const StreamLabels& streamLabels_; }; inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { @@ -164,7 +164,9 @@ inline RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) { case proto::ColumnEncoding_Kind_DICTIONARY_V2: return RleVersion_2; default: - DWIO_RAISE("Unknown encoding in convertRleVersion"); + VELOX_FAIL( + "Unknown encoding in convertRleVersion: {}", + static_cast(kind)); } } diff --git a/velox/dwio/dwrf/reader/DwrfReader.cpp b/velox/dwio/dwrf/reader/DwrfReader.cpp index 5f27261f270b..d20d67bbb0a9 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.cpp +++ b/velox/dwio/dwrf/reader/DwrfReader.cpp @@ -47,26 +47,26 @@ class DwrfUnit : public LoadUnit { RowReaderOptions options) : stripeReaderBase_{stripeReaderBase}, strideIndexProvider_{strideIndexProvider}, - columnReaderStatistics_{columnReaderStatistics}, + columnReaderStatistics_{&columnReaderStatistics}, stripeIndex_{stripeIndex}, columnSelector_{std::move(columnSelector)}, projectedNodes_{projectedNodes}, options_{std::move(options)}, stripeInfo_{ - stripeReaderBase.getReader().getFooter().stripes(stripeIndex_)} {} + stripeReaderBase.getReader().footer().stripes(stripeIndex_)} {} ~DwrfUnit() override = default; - // Perform the IO (read) + /// Performs the IO (read) void load() override; - // Unload the unit to free memory + /// Unloads the unit to free memory void unload() override; - // Number of rows in the unit + /// Number of rows in the unit uint64_t getNumRows() override; - // Number of bytes that the IO will read + /// Number of bytes that the IO will read uint64_t getIoSize() override; std::unique_ptr& getColumnReader() { @@ -85,10 +85,10 @@ class DwrfUnit : public LoadUnit { // Immutables const StripeReaderBase& stripeReaderBase_; const StrideIndexProvider& strideIndexProvider_; - dwio::common::ColumnReaderStatistics& columnReaderStatistics_; + dwio::common::ColumnReaderStatistics* const columnReaderStatistics_; const uint32_t stripeIndex_; const std::shared_ptr columnSelector_; - std::shared_ptr projectedNodes_; + const std::shared_ptr projectedNodes_; const RowReaderOptions options_; const StripeInformationWrapper stripeInfo_; @@ -135,7 +135,7 @@ void DwrfUnit::ensureDecoders() { return; } - preloaded_ = options_.getPreloadStripe(); + preloaded_ = options_.preloadStripe(); stripeReadState_ = std::make_shared( stripeReaderBase_.readerBaseShared(), @@ -151,11 +151,11 @@ void DwrfUnit::ensureDecoders() { strideIndexProvider_, stripeIndex_); - auto scanSpec = options_.getScanSpec().get(); - auto fileType = stripeReaderBase_.getReader().getSchemaWithId(); + auto* scanSpec = options_.scanSpec().get(); + const auto& fileType = stripeReaderBase_.getReader().schemaWithId(); FlatMapContext flatMapContext; - flatMapContext.keySelectionCallback = options_.getKeySelectionCallback(); - memory::AllocationPool pool(&stripeReaderBase_.getReader().getMemoryPool()); + flatMapContext.keySelectionCallback = options_.keySelectionCallback(); + memory::AllocationPool pool(&stripeReaderBase_.getReader().memoryPool()); StreamLabels streamLabels(pool); if (scanSpec) { @@ -164,13 +164,13 @@ void DwrfUnit::ensureDecoders() { fileType, *stripeStreams_, streamLabels, - columnReaderStatistics_, + *columnReaderStatistics_, scanSpec, flatMapContext, - true); // isRoot + /*isRoot=*/true); selectiveColumnReader_->setIsTopLevel(); selectiveColumnReader_->setFillMutatedOutputRows( - options_.getRowNumberColumnInfo().has_value()); + options_.rowNumberColumnInfo().has_value()); } else { auto requestedType = columnSelector_->getSchemaWithId(); auto factory = &ColumnReaderFactory::defaultFactory(); @@ -183,19 +183,20 @@ void DwrfUnit::ensureDecoders() { fileType, *stripeStreams_, streamLabels, - options_.getDecodingExecutor().get(), - options_.getDecodingParallelismFactor(), + options_.decodingExecutor().get(), + options_.decodingParallelismFactor(), flatMapContext); } - DWIO_ENSURE( - (columnReader_ != nullptr) != (selectiveColumnReader_ != nullptr), + + VELOX_CHECK_NE( + columnReader_ != nullptr, + selectiveColumnReader_ != nullptr, "ColumnReader was not created"); } void DwrfUnit::loadDecoders() { - // load data plan according to its updated selector - // during column reader construction - // if planReads is off which means stripe data loaded as whole + // load data plan according to its updated selector during column reader + // construction if planReads is off which means stripe data loaded as whole. if (!preloaded_) { VLOG(1) << "[DWRF] Load read plan for stripe " << stripeIndex_; stripeStreams_->loadReadPlan(); @@ -207,9 +208,9 @@ void DwrfUnit::loadDecoders() { namespace { DwrfUnit* castDwrfUnit(LoadUnit* unit) { - VELOX_CHECK(unit != nullptr); + VELOX_CHECK_NOT_NULL(unit); auto* dwrfUnit = dynamic_cast(unit); - VELOX_CHECK(dwrfUnit != nullptr); + VELOX_CHECK_NOT_NULL(dwrfUnit); return dwrfUnit; } @@ -230,37 +231,33 @@ DwrfRowReader::DwrfRowReader( const std::shared_ptr& reader, const RowReaderOptions& opts) : StripeReaderBase(reader), - strideIndex_{0}, options_(opts), - decodingTimeCallback_{options_.getDecodingTimeCallback()}, columnSelector_{ - options_.getScanSpec() + options_.scanSpec() != nullptr ? nullptr : std::make_shared(ColumnSelector::apply( - opts.getSelector(), - reader->getSchema()))}, + options_.selector(), + reader->schema()))}, + decodingTimeCallback_{options_.decodingTimeCallback()}, + strideIndex_{0}, currentUnit_{nullptr} { - auto& fileFooter = getReader().getFooter(); - uint32_t numberOfStripes = fileFooter.stripesSize(); + const auto& fileFooter = getReader().footer(); + const uint32_t numberOfStripes = fileFooter.stripesSize(); currentStripe_ = numberOfStripes; stripeCeiling_ = 0; currentRowInStripe_ = 0; rowsInCurrentStripe_ = 0; - uint64_t rowTotal = 0; + uint64_t numRows{0}; firstRowOfStripe_.reserve(numberOfStripes); for (uint32_t i = 0; i < numberOfStripes; ++i) { - firstRowOfStripe_.push_back(rowTotal); - auto stripeInfo = fileFooter.stripes(i); - rowTotal += stripeInfo.numberOfRows(); - if ((stripeInfo.offset() >= opts.getOffset()) && - (stripeInfo.offset() < opts.getLimit())) { - if (i < currentStripe_) { - currentStripe_ = i; - } - if (i >= stripeCeiling_) { - stripeCeiling_ = i + 1; - } + firstRowOfStripe_.push_back(numRows); + const auto stripeInfo = fileFooter.stripes(i); + numRows += stripeInfo.numberOfRows(); + if ((stripeInfo.offset() >= options_.offset()) && + (stripeInfo.offset() < options_.limit())) { + currentStripe_ = std::min(currentStripe_, i); + stripeCeiling_ = std::max(stripeCeiling_, i + 1); } } firstStripe_ = currentStripe_; @@ -272,7 +269,7 @@ DwrfRowReader::DwrfRowReader( stripeCeiling_ = firstStripe_; } - auto stripeCountCallback = options_.getStripeCountCallback(); + const auto stripeCountCallback = options_.stripeCountCallback(); if (stripeCountCallback) { stripeCountCallback(stripeCeiling_ - firstStripe_); } @@ -287,23 +284,22 @@ DwrfRowReader::DwrfRowReader( // Validate the requested type is compatible with what's in the file std::function createExceptionContext = [&]() { - std::string exceptionMessageContext = fmt::format( + return fmt::format( "The schema loaded in the reader does not match the schema in the file footer." "Input Name: {},\n" "File Footer Schema (without partition columns): {},\n" "Input Table Schema (with partition columns): {}\n", - getReader().getBufferedInput().getName(), - getReader().getSchema()->toString(), - getType()->toString()); - return exceptionMessageContext; + getReader().bufferedInput().getName(), + getReader().schema()->toString(), + type()->toString()); }; if (columnSelector_) { dwio::common::typeutils::checkTypeCompatibility( - *getReader().getSchema(), *columnSelector_, createExceptionContext); + *getReader().schema(), *columnSelector_, createExceptionContext); } else { projectedNodes_ = std::make_shared(0); - makeProjectedNodes(*getReader().getSchemaWithId(), *projectedNodes_); + makeProjectedNodes(*getReader().schemaWithId(), *projectedNodes_); } unitLoader_ = getUnitLoader(); @@ -323,10 +319,10 @@ DwrfRowReader::getSelectiveColumnReader() { std::unique_ptr DwrfRowReader::getUnitLoader() { std::vector> loadUnits; loadUnits.reserve(stripeCeiling_ - firstStripe_); - for (auto stripe = firstStripe_; stripe < stripeCeiling_; stripe++) { + for (auto stripe = firstStripe_; stripe < stripeCeiling_; ++stripe) { loadUnits.emplace_back(std::make_unique( - /* stripeReaderBase */ *this, - /* strideIndexProvider */ *this, + /*stripeReaderBase=*/*this, + /*strideIndexProvider=*/*this, columnReaderStatistics_, stripe, columnSelector_, @@ -334,11 +330,11 @@ std::unique_ptr DwrfRowReader::getUnitLoader() { options_)); } std::shared_ptr unitLoaderFactory = - options_.getUnitLoaderFactory(); + options_.unitLoaderFactory(); if (!unitLoaderFactory) { unitLoaderFactory = std::make_shared( - options_.getBlockedOnIoCallback()); + options_.blockedOnIoCallback()); } return unitLoaderFactory->create(std::move(loadUnits), 0); } @@ -350,23 +346,22 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) { } nextRowNumber_.reset(); - // If we are reading only a portion of the file - // (bounded by firstStripe_ and stripeCeiling_), - // seeking before or after the portion of interest should return no data. - // Implement this by setting previousRow_ to the number of rows in the file. + // If we are reading only a portion of the file (bounded by firstStripe_ and + // stripeCeiling_), seeking before or after the portion of interest should + // return no data. Implement this by setting previousRow_ to the number of + // rows in the file. - // seeking past stripeCeiling_ - auto& fileFooter = getReader().getFooter(); - uint32_t num_stripes = fileFooter.stripesSize(); - if ((stripeCeiling_ == num_stripes && + // seeking past stripeCeiling_. + const auto& fileFooter = getReader().footer(); + const uint32_t numStripes = fileFooter.stripesSize(); + if ((stripeCeiling_ == numStripes && rowNumber >= fileFooter.numberOfRows()) || - (stripeCeiling_ < num_stripes && + (stripeCeiling_ < numStripes && rowNumber >= firstRowOfStripe_[stripeCeiling_])) { VLOG(1) << "Trying to seek past stripeCeiling_, total rows: " - << fileFooter.numberOfRows() << " num_stripes: " << num_stripes; - - currentStripe_ = num_stripes; + << fileFooter.numberOfRows() << " num_stripes: " << numStripes; + currentStripe_ = numStripes; previousRow_ = fileFooter.numberOfRows(); return previousRow_; } @@ -374,12 +369,12 @@ uint64_t DwrfRowReader::seekToRow(uint64_t rowNumber) { uint32_t seekToStripe = 0; while (seekToStripe + 1 < stripeCeiling_ && firstRowOfStripe_[seekToStripe + 1] <= rowNumber) { - seekToStripe++; + ++seekToStripe; } // seeking before the first stripe if (seekToStripe < firstStripe_) { - currentStripe_ = num_stripes; + currentStripe_ = numStripes; previousRow_ = fileFooter.numberOfRows(); return previousRow_; } @@ -441,7 +436,7 @@ uint64_t DwrfRowReader::skipRows(uint64_t numberOfRowsToSkip) { } // when we skipped or exhausted the whole file we can return 0 - auto& fileFooter = getReader().getFooter(); + const auto& fileFooter = getReader().footer(); if (previousRow_ == fileFooter.numberOfRows()) { VLOG(1) << "previousRow_ is beyond EOF, nothing to skip"; return 0; @@ -462,7 +457,7 @@ uint64_t DwrfRowReader::skipRows(uint64_t numberOfRowsToSkip) { VLOG(1) << "Previous row: " << previousRow_ << " Skipping: " << numberOfRowsToSkip; - uint64_t initialRow = previousRow_; + const uint64_t initialRow = previousRow_; seekToRow(previousRow_ + numberOfRowsToSkip); VLOG(1) << "After skipping: " << previousRow_ @@ -482,11 +477,10 @@ void DwrfRowReader::checkSkipStrides(uint64_t strideSize) { } if (currentRowInStripe_ == 0 || recomputeStridesToSkip_) { - StatsContext context( - getReader().getWriterName(), getReader().getWriterVersion()); + StatsContext context(getReader().writerName(), getReader().writerVersion()); DwrfData::FilterRowGroupsResult res; getSelectiveColumnReader()->filterRowGroups(strideSize, context, res); - if (auto& metadataFilter = options_.getMetadataFilter()) { + if (auto& metadataFilter = options_.metadataFilter()) { metadataFilter->eval(res.metadataFilterResults, res.filterResult); } stridesToSkip_ = res.filterResult.data(); @@ -502,8 +496,8 @@ void DwrfRowReader::checkSkipStrides(uint64_t strideSize) { foundStridesToSkip = true; currentRowInStripe_ = std::min(currentRowInStripe_ + strideSize, rowsInCurrentStripe_); - currentStride++; - skippedStrides_++; + ++currentStride; + ++skippedStrides_; } if (foundStridesToSkip && currentRowInStripe_ < rowsInCurrentStripe_) { getSelectiveColumnReader()->seekToRowGroup(currentStride); @@ -523,9 +517,8 @@ void DwrfRowReader::readNext( } // TODO: Move row number appending logic here. Currently this is done in // the wrapper reader. - VELOX_CHECK( - mutation == nullptr, - "Mutation pushdown is only supported in selective reader"); + VELOX_CHECK_NULL( + mutation, "Mutation pushdown is only supported in selective reader"); getColumnReader()->next(rowsToRead, result); if (startTime.has_value()) { decodingTimeCallback_( @@ -533,10 +526,12 @@ void DwrfRowReader::readNext( } return; } - if (!options_.getRowNumberColumnInfo().has_value()) { + + if (!options_.rowNumberColumnInfo().has_value()) { getSelectiveColumnReader()->next(rowsToRead, result, mutation); return; } + readWithRowNumber( getSelectiveColumnReader(), options_, @@ -558,32 +553,37 @@ int64_t DwrfRowReader::nextRowNumber() { if (nextRowNumber_.has_value()) { return *nextRowNumber_; } - auto strideSize = getReader().getFooter().rowIndexStride(); + + const auto strideSize = getReader().footer().rowIndexStride(); while (currentStripe_ < stripeCeiling_) { if (currentRowInStripe_ == 0) { if (getReader().randomSkip()) { - auto numStripeRows = - getReader().getFooter().stripes(currentStripe_).numberOfRows(); - auto skip = getReader().randomSkip()->nextSkip(); - if (skip >= numStripeRows) { + const auto numStripeRows = + getReader().footer().stripes(currentStripe_).numberOfRows(); + const auto skipRows = getReader().randomSkip()->nextSkip(); + if (skipRows >= numStripeRows) { getReader().randomSkip()->consume(numStripeRows); - auto numStrides = (numStripeRows + strideSize - 1) / strideSize; + const auto numStrides = bits::divRoundUp(numStripeRows, strideSize); skippedStrides_ += numStrides; goto advanceToNextStripe; } } loadCurrentStripe(); + VELOX_CHECK_NOT_NULL(currentUnit_); } + checkSkipStrides(strideSize); if (currentRowInStripe_ < rowsInCurrentStripe_) { nextRowNumber_ = firstRowOfStripe_[currentStripe_] + currentRowInStripe_; return *nextRowNumber_; } + advanceToNextStripe: ++currentStripe_; currentRowInStripe_ = 0; currentUnit_ = nullptr; } + nextRowNumber_ = kAtEnd; return kAtEnd; } @@ -593,8 +593,9 @@ int64_t DwrfRowReader::nextReadSize(uint64_t size) { if (nextRowNumber() == kAtEnd) { return kAtEnd; } + auto rowsToRead = std::min(size, rowsInCurrentStripe_ - currentRowInStripe_); - auto strideSize = getReader().getFooter().rowIndexStride(); + const auto strideSize = getReader().footer().rowIndexStride(); if (LIKELY(strideSize > 0)) { // Don't allow read to cross stride. rowsToRead = @@ -612,18 +613,19 @@ uint64_t DwrfRowReader::next( if (nextRow == kAtEnd) { if (!isEmptyFile()) { previousRow_ = firstRowOfStripe_[stripeCeiling_ - 1] + - getReader().getFooter().stripes(stripeCeiling_ - 1).numberOfRows(); + getReader().footer().stripes(stripeCeiling_ - 1).numberOfRows(); } else { previousRow_ = 0; } return 0; } - auto rowsToRead = nextReadSize(size); + + const auto rowsToRead = nextReadSize(size); nextRowNumber_.reset(); previousRow_ = nextRow; // Record strideIndex for use by the columnReader_ which may delay actual // reading of the data. - auto strideSize = getReader().getFooter().rowIndexStride(); + const auto strideSize = getReader().footer().rowIndexStride(); strideIndex_ = strideSize > 0 ? currentRowInStripe_ / strideSize : 0; const auto loadUnitIdx = currentStripe_ - firstStripe_; unitLoader_->onRead(loadUnitIdx, currentRowInStripe_, rowsToRead); @@ -645,6 +647,7 @@ void DwrfRowReader::loadCurrentStripe() { if (currentUnit_ || currentStripe_ >= stripeCeiling_) { return; } + VELOX_CHECK_GE(currentStripe_, firstStripe_); strideIndex_ = 0; const auto loadUnitIdx = currentStripe_ - firstStripe_; @@ -685,18 +688,18 @@ std::optional DwrfRowReader::estimatedRowSizeHelper( const FooterWrapper& fileFooter, const dwio::common::Statistics& stats, uint32_t nodeId) const { - DWIO_ENSURE_LT(nodeId, fileFooter.typesSize(), "Types missing in footer"); + VELOX_CHECK_LT(nodeId, fileFooter.typesSize(), "Types missing in footer"); - const auto& s = stats.getColumnStatistics(nodeId); - const auto& t = fileFooter.types(nodeId); - if (!s.getNumberOfValues()) { + const auto& nodeStats = stats.getColumnStatistics(nodeId); + const auto& nodeType = fileFooter.types(nodeId); + if (!nodeStats.getNumberOfValues()) { return std::nullopt; } - auto valueCount = s.getNumberOfValues().value(); + const auto valueCount = nodeStats.getNumberOfValues().value(); if (valueCount < 1) { return 0; } - switch (t.kind()) { + switch (nodeType.kind()) { case TypeKind::BOOLEAN: { return valueCount * sizeof(uint8_t); } @@ -723,10 +726,10 @@ std::optional DwrfRowReader::estimatedRowSizeHelper( } case TypeKind::VARCHAR: return getStringOrBinaryColumnSize( - s); + nodeStats); case TypeKind::VARBINARY: return getStringOrBinaryColumnSize( - s); + nodeStats); case TypeKind::TIMESTAMP: { return valueCount * sizeof(uint64_t) * 2; } @@ -735,12 +738,12 @@ std::optional DwrfRowReader::estimatedRowSizeHelper( case TypeKind::ROW: { // start the estimate with the offsets and hasNulls vectors sizes size_t totalEstimate = valueCount * (sizeof(uint8_t) + sizeof(uint64_t)); - for (int32_t i = 0; i < t.subtypesSize(); ++i) { - if (!shouldReadNode(t.subtypes(i))) { + for (int32_t i = 0; i < nodeType.subtypesSize(); ++i) { + if (!shouldReadNode(nodeType.subtypes(i))) { continue; } - auto subtypeEstimate = - estimatedRowSizeHelper(fileFooter, stats, t.subtypes(i)); + const auto subtypeEstimate = + estimatedRowSizeHelper(fileFooter, stats, nodeType.subtypes(i)); if (subtypeEstimate.has_value()) { totalEstimate += subtypeEstimate.value(); } else { @@ -755,8 +758,8 @@ std::optional DwrfRowReader::estimatedRowSizeHelper( } std::optional DwrfRowReader::estimatedRowSize() const { - auto& reader = getReader(); - auto& fileFooter = reader.getFooter(); + const auto& reader = getReader(); + const auto& fileFooter = reader.footer(); if (!fileFooter.hasNumberOfRows()) { return std::nullopt; @@ -766,14 +769,14 @@ std::optional DwrfRowReader::estimatedRowSize() const { return 0; } - // Estimate with projections + // Estimate with projections. constexpr uint32_t ROOT_NODE_ID = 0; - auto stats = reader.getStatistics(); - auto projectedSize = estimatedRowSizeHelper(fileFooter, *stats, ROOT_NODE_ID); + const auto stats = reader.statistics(); + const auto projectedSize = + estimatedRowSizeHelper(fileFooter, *stats, ROOT_NODE_ID); if (projectedSize.has_value()) { return projectedSize.value() / fileFooter.numberOfRows(); } - return std::nullopt; } @@ -826,8 +829,8 @@ TypePtr updateColumnNames( // Non-primitive type tree recursion function. template TypePtr updateColumnNames(const TypePtr& fileType, const TypePtr& tableType) { - auto fileRowType = std::dynamic_pointer_cast(fileType); - auto tableRowType = std::dynamic_pointer_cast(tableType); + const auto fileRowType = std::dynamic_pointer_cast(fileType); + const auto tableRowType = std::dynamic_pointer_cast(tableType); std::vector newFileFieldNames; newFileFieldNames.reserve(fileRowType->size()); @@ -897,16 +900,16 @@ TypePtr updateColumnNames( void DwrfReader::updateColumnNamesFromTableSchema() { const auto& tableSchema = options_.fileSchema(); - const auto& fileSchema = readerBase_->getSchema(); + const auto& fileSchema = readerBase_->schema(); readerBase_->setSchema(std::dynamic_pointer_cast( updateColumnNames(fileSchema, tableSchema, "", ""))); } std::unique_ptr DwrfReader::getStripe( uint32_t stripeIndex) const { - DWIO_ENSURE_LT( + VELOX_CHECK_LT( stripeIndex, getNumberOfStripes(), "stripe index out of range"); - auto stripeInfo = readerBase_->getFooter().stripes(stripeIndex); + auto stripeInfo = readerBase_->footer().stripes(stripeIndex); return std::make_unique( stripeInfo.offset(), @@ -918,7 +921,7 @@ std::unique_ptr DwrfReader::getStripe( std::vector DwrfReader::getMetadataKeys() const { std::vector result; - auto& fileFooter = readerBase_->getFooter(); + auto& fileFooter = readerBase_->footer(); result.reserve(fileFooter.metadataSize()); for (int32_t i = 0; i < fileFooter.metadataSize(); ++i) { result.push_back(fileFooter.metadata(i).name()); @@ -927,17 +930,17 @@ std::vector DwrfReader::getMetadataKeys() const { } std::string DwrfReader::getMetadataValue(const std::string& key) const { - auto& fileFooter = readerBase_->getFooter(); + auto& fileFooter = readerBase_->footer(); for (int32_t i = 0; i < fileFooter.metadataSize(); ++i) { if (fileFooter.metadata(i).name() == key) { return fileFooter.metadata(i).value(); } } - DWIO_RAISE("key not found"); + VELOX_FAIL("key not found: {}", key); } bool DwrfReader::hasMetadataValue(const std::string& key) const { - auto& fileFooter = readerBase_->getFooter(); + auto& fileFooter = readerBase_->footer(); for (int32_t i = 0; i < fileFooter.metadataSize(); ++i) { if (fileFooter.metadata(i).name() == key) { return true; @@ -997,28 +1000,28 @@ uint64_t maxStreamsForType(const TypeWrapper& type) { } uint64_t DwrfReader::getMemoryUse(int32_t stripeIx) { - ColumnSelector cs(readerBase_->getSchema()); + ColumnSelector cs(readerBase_->schema()); return getMemoryUse(*readerBase_, stripeIx, cs); } uint64_t DwrfReader::getMemoryUseByFieldId( const std::vector& include, int32_t stripeIx) { - ColumnSelector cs(readerBase_->getSchema(), include); + ColumnSelector cs(readerBase_->schema(), include); return getMemoryUse(*readerBase_, stripeIx, cs); } uint64_t DwrfReader::getMemoryUseByName( const std::vector& names, int32_t stripeIx) { - ColumnSelector cs(readerBase_->getSchema(), names); + ColumnSelector cs(readerBase_->schema(), names); return getMemoryUse(*readerBase_, stripeIx, cs); } uint64_t DwrfReader::getMemoryUseByTypeId( const std::vector& include, int32_t stripeIx) { - ColumnSelector cs(readerBase_->getSchema(), include, true); + ColumnSelector cs(readerBase_->schema(), include, true); return getMemoryUse(*readerBase_, stripeIx, cs); } @@ -1027,27 +1030,23 @@ uint64_t DwrfReader::getMemoryUse( int32_t stripeIx, const ColumnSelector& cs) { uint64_t maxDataLength = 0; - auto& fileFooter = readerBase.getFooter(); + const auto& fileFooter = readerBase.footer(); if (stripeIx >= 0 && stripeIx < fileFooter.stripesSize()) { - uint64_t stripe = fileFooter.stripes(stripeIx).dataLength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } + const uint64_t stripeLength = fileFooter.stripes(stripeIx).dataLength(); + maxDataLength = std::max(maxDataLength, stripeLength); } else { - for (int32_t i = 0; i < fileFooter.stripesSize(); i++) { - uint64_t stripe = fileFooter.stripes(i).dataLength(); - if (maxDataLength < stripe) { - maxDataLength = stripe; - } + for (int32_t i = 0; i < fileFooter.stripesSize(); ++i) { + const uint64_t stripeLength = fileFooter.stripes(i).dataLength(); + maxDataLength = std::max(maxDataLength, stripeLength); } } bool hasStringColumn = false; - uint64_t nSelectedStreams = 0; - for (int32_t i = 0; !hasStringColumn && i < fileFooter.typesSize(); i++) { + uint64_t numSelectedStreams = 0; + for (int32_t i = 0; !hasStringColumn && i < fileFooter.typesSize(); ++i) { if (cs.shouldReadNode(i)) { const auto type = fileFooter.types(i); - nSelectedStreams += maxStreamsForType(type); + numSelectedStreams += maxStreamsForType(type); switch (type.kind()) { case TypeKind::VARCHAR: case TypeKind::VARBINARY: { @@ -1061,46 +1060,45 @@ uint64_t DwrfReader::getMemoryUse( } } - /* If a string column is read, use stripe datalength as a memory estimate - * because we don't know the dictionary size. Multiply by 2 because - * a string column requires two buffers: - * in the input stream and in the seekable input stream. - * If no string column is read, estimate from the number of streams. - */ - uint64_t memory = hasStringColumn ? 2 * maxDataLength - : std::min( - uint64_t(maxDataLength), - nSelectedStreams * - readerBase.getBufferedInput() - .getReadFile() - ->getNaturalReadSize()); + // If a string column is read, use stripe datalength as a memory estimate + // because we don't know the dictionary size. Multiply by 2 because a string + // column requires two buffers: in the input stream and in the seekable + // input stream. If no string column is read, estimate from the number of + // streams. + uint64_t memoryBytes = hasStringColumn + ? 2 * maxDataLength + : std::min( + uint64_t(maxDataLength), + numSelectedStreams * + readerBase.bufferedInput().getReadFile()->getNaturalReadSize()); // Do we need even more memory to read the footer or the metadata? - auto footerLength = readerBase.getPostScript().footerLength(); - if (memory < footerLength + readerBase.getFooterEstimatedSize()) { - memory = footerLength + readerBase.getFooterEstimatedSize(); + const auto footerLength = readerBase.postScript().footerLength(); + if (memoryBytes < footerLength + readerBase.footerEstimatedSize()) { + memoryBytes = footerLength + readerBase.footerEstimatedSize(); } // Account for firstRowOfStripe. - memory += static_cast(fileFooter.stripesSize()) * sizeof(uint64_t); + memoryBytes += + static_cast(fileFooter.stripesSize()) * sizeof(uint64_t); // Decompressors need buffers for each stream - uint64_t decompressorMemory = 0; - auto compression = readerBase.getCompressionKind(); - if (compression != common::CompressionKind_NONE) { - for (int32_t i = 0; i < fileFooter.typesSize(); i++) { + uint64_t decompressorMemoryBytes = 0; + const auto compressionKind = readerBase.compressionKind(); + if (compressionKind != common::CompressionKind_NONE) { + for (int32_t i = 0; i < fileFooter.typesSize(); ++i) { if (cs.shouldReadNode(i)) { const auto type = fileFooter.types(i); - decompressorMemory += - maxStreamsForType(type) * readerBase.getCompressionBlockSize(); + decompressorMemoryBytes += + maxStreamsForType(type) * readerBase.compressionBlockSize(); } } - if (compression == common::CompressionKind_SNAPPY) { - decompressorMemory *= 2; // Snappy decompressor uses a second buffer + if (compressionKind == common::CompressionKind_SNAPPY) { + decompressorMemoryBytes *= 2; // Snappy decompressor uses a second buffer } } - return memory + decompressorMemory; + return memoryBytes + decompressorMemoryBytes; } std::unique_ptr DwrfReader::createRowReader( @@ -1111,7 +1109,7 @@ std::unique_ptr DwrfReader::createRowReader( std::unique_ptr DwrfReader::createDwrfRowReader( const RowReaderOptions& opts) const { auto rowReader = std::make_unique(readerBase_, opts); - if (opts.getEagerFirstStripeLoad()) { + if (opts.eagerFirstStripeLoad()) { // Load the first stripe on construction so that readers created in // background have a reader tree and can preload the first // stripe. Also the reader tree needs to exist in order to receive diff --git a/velox/dwio/dwrf/reader/DwrfReader.h b/velox/dwio/dwrf/reader/DwrfReader.h index a25d704aaa56..43a4dc5fae7b 100644 --- a/velox/dwio/dwrf/reader/DwrfReader.h +++ b/velox/dwio/dwrf/reader/DwrfReader.h @@ -67,11 +67,11 @@ class DwrfRowReader : public StrideIndexProvider, return columnSelector_; } - const dwio::common::RowReaderOptions& getRowReaderOptions() const { + const dwio::common::RowReaderOptions& rowReaderOptions() const { return options_; } - std::shared_ptr getSelectedType() const { + std::shared_ptr selectedType() const { if (!selectedSchema_) { selectedSchema_ = columnSelector_->buildSelected(); } @@ -79,7 +79,7 @@ class DwrfRowReader : public StrideIndexProvider, return selectedSchema_; } - uint64_t getRowNumber() const { + uint64_t rowNumber() const { return previousRow_; } @@ -87,7 +87,7 @@ class DwrfRowReader : public StrideIndexProvider, uint64_t skipRows(uint64_t numberOfRowsToSkip); - uint32_t getCurrentStripe() const { + uint32_t currentStripe() const { return currentStripe_; } @@ -95,13 +95,13 @@ class DwrfRowReader : public StrideIndexProvider, return strideIndex_; } - // Estimate the space used by the reader + /// Estimates the space used by the reader size_t estimatedReaderMemory() const; - // Estimate the row size for projected columns + /// Estimates the row size for projected columns std::optional estimatedRowSize() const override; - // Returns number of rows read. Guaranteed to be less then or equal to size. + /// Returns number of rows read. Guaranteed to be less then or equal to size. uint64_t next( uint64_t size, VectorPtr& result, @@ -139,7 +139,7 @@ class DwrfRowReader : public StrideIndexProvider, int64_t nextReadSize(uint64_t size) override; - std::shared_ptr getType() const { + std::shared_ptr type() const { if (columnSelector_) { return columnSelector_->getSchema(); } @@ -174,6 +174,12 @@ class DwrfRowReader : public StrideIndexProvider, std::unique_ptr getUnitLoader(); + const dwio::common::RowReaderOptions options_; + // column selector + const std::shared_ptr columnSelector_; + const std::function + decodingTimeCallback_; + // footer std::vector firstRowOfStripe_; mutable std::shared_ptr selectedSchema_; @@ -188,12 +194,6 @@ class DwrfRowReader : public StrideIndexProvider, uint64_t currentRowInStripe_; uint64_t rowsInCurrentStripe_; uint64_t strideIndex_; - dwio::common::RowReaderOptions options_; - std::function - decodingTimeCallback_; - - // column selector - std::shared_ptr columnSelector_; std::shared_ptr projectedNodes_; @@ -204,9 +204,9 @@ class DwrfRowReader : public StrideIndexProvider, // Number of skipped strides. int64_t skippedStrides_{0}; - // Set to true after clearing filter caches, i.e. adding a dynamic - // filter. Causes filters to be re-evaluated against stride stats on - // next stride instead of next stripe. + // Set to true after clearing filter caches, i.e. adding a dynamic filter. + // Causes filters to be re-evaluated against stride stats on next stride + // instead of next stripe. bool recomputeStridesToSkip_{false}; dwio::common::ColumnReaderStatistics columnReaderStatistics_; @@ -229,15 +229,15 @@ class DwrfReader : public dwio::common::Reader { ~DwrfReader() override = default; common::CompressionKind getCompression() const { - return readerBase_->getCompressionKind(); + return readerBase_->compressionKind(); } WriterVersion getWriterVersion() const { - return readerBase_->getWriterVersion(); + return readerBase_->writerVersion(); } const std::string& getWriterName() const { - return readerBase_->getWriterName(); + return readerBase_->writerName(); } std::vector getMetadataKeys() const; @@ -247,54 +247,54 @@ class DwrfReader : public dwio::common::Reader { bool hasMetadataValue(const std::string& key) const; uint64_t getCompressionBlockSize() const { - return readerBase_->getCompressionBlockSize(); + return readerBase_->compressionBlockSize(); } uint32_t getNumberOfStripes() const { - return readerBase_->getFooter().stripesSize(); + return readerBase_->footer().stripesSize(); } std::vector getRowsPerStripe() const { - return readerBase_->getRowsPerStripe(); + return readerBase_->rowsPerStripe(); } uint32_t strideSize() const { - return readerBase_->getFooter().rowIndexStride(); + return readerBase_->footer().rowIndexStride(); } std::unique_ptr getStripe(uint32_t) const; uint64_t getFileLength() const { - return readerBase_->getFileLength(); + return readerBase_->fileLength(); } std::unique_ptr getStatistics() const { - return readerBase_->getStatistics(); + return readerBase_->statistics(); } std::unique_ptr columnStatistics( uint32_t nodeId) const override { - return readerBase_->getColumnStatistics(nodeId); + return readerBase_->columnStatistics(nodeId); } const std::shared_ptr& rowType() const override { - return readerBase_->getSchema(); + return readerBase_->schema(); } const std::shared_ptr& typeWithId() const override { - return readerBase_->getSchemaWithId(); + return readerBase_->schemaWithId(); } const PostScript& getPostscript() const { - return readerBase_->getPostScript(); + return readerBase_->postScript(); } const FooterWrapper& getFooter() const { - return readerBase_->getFooter(); + return readerBase_->footer(); } std::optional numberOfRows() const override { - auto& fileFooter = readerBase_->getFooter(); + auto& fileFooter = readerBase_->footer(); if (fileFooter.hasNumberOfRows()) { return fileFooter.numberOfRows(); } diff --git a/velox/dwio/dwrf/reader/EncodingContext.h b/velox/dwio/dwrf/reader/EncodingContext.h index 823d1ef7b929..b7e4432b45b3 100644 --- a/velox/dwio/dwrf/reader/EncodingContext.h +++ b/velox/dwio/dwrf/reader/EncodingContext.h @@ -21,8 +21,8 @@ namespace facebook::velox::dwrf { -// Struct containing context for flatmap encoding. -// Default initialization is non-flatmap context. +/// Struct containing context for flatmap encoding. +/// Default initialization is non-flatmap context. struct FlatMapContext { public: uint32_t sequence{0}; diff --git a/velox/dwio/dwrf/reader/FlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/FlatMapColumnReader.cpp index 65d0df90e2b3..d601e3416a2d 100644 --- a/velox/dwio/dwrf/reader/FlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/FlatMapColumnReader.cpp @@ -218,7 +218,7 @@ FlatMapColumnReader::FlatMapColumnReader( ColumnReaderFactory& factory) : ColumnReader(fileType, stripe, streamLabels, std::move(flatMapContext)), requestedType_{requestedType}, - returnFlatVector_{stripe.getRowReaderOptions().getReturnFlatVector()}, + returnFlatVector_{stripe.rowReaderOptions().returnFlatVector()}, executor_{executor} { DWIO_ENSURE_EQ(fileType_->id(), fileType->id()); @@ -605,7 +605,7 @@ std::vector>> getKeyNodesForStructEncoding( factory); const auto& mapColumnIdAsStruct = - stripe.getRowReaderOptions().getMapColumnIdAsStruct(); + stripe.rowReaderOptions().mapColumnIdAsStruct(); auto it = mapColumnIdAsStruct.find(requestedType->id()); DWIO_ENSURE(it != mapColumnIdAsStruct.end()); @@ -766,7 +766,7 @@ void FlatMapStructEncodingColumnReader::next( inline bool isRequiringStructEncoding( const std::shared_ptr& requestedType, const dwio::common::RowReaderOptions& rowOptions) { - return rowOptions.getMapColumnIdAsStruct().count(requestedType->id()) > 0; + return rowOptions.mapColumnIdAsStruct().count(requestedType->id()) > 0; } template @@ -779,7 +779,7 @@ std::unique_ptr createFlatMapColumnReader( size_t decodingParallelismFactor, FlatMapContext flatMapContext, ColumnReaderFactory& factory) { - if (isRequiringStructEncoding(requestedType, stripe.getRowReaderOptions())) { + if (isRequiringStructEncoding(requestedType, stripe.rowReaderOptions())) { return std::make_unique>( requestedType, fileType, diff --git a/velox/dwio/dwrf/reader/ReaderBase.cpp b/velox/dwio/dwrf/reader/ReaderBase.cpp index 8293922b3b82..ff2b13c9dd7c 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.cpp +++ b/velox/dwio/dwrf/reader/ReaderBase.cpp @@ -35,8 +35,8 @@ using memory::MemoryPool; FooterStatisticsImpl::FooterStatisticsImpl( const ReaderBase& reader, const StatsContext& statsContext) { - auto& footer = reader.getFooter(); - auto& handler = reader.getDecryptionHandler(); + auto& footer = reader.footer(); + auto& handler = reader.decryptionHandler(); colStats_.resize(footer.statisticsSize()); // fill in the encrypted stats if (handler.isEncrypted()) { @@ -213,9 +213,9 @@ ReaderBase::ReaderBase( } } if (!cache_ && input_->shouldPrefetchStripes()) { - const auto numStripes = getFooter().stripesSize(); + const auto numStripes = footer().stripesSize(); for (auto i = 0; i < numStripes; i++) { - const auto stripe = getFooter().stripes(i); + const auto stripe = footer().stripes(i); input_->enqueue( {stripe.offset() + stripe.indexLength() + stripe.dataLength(), stripe.footerLength(), @@ -229,28 +229,28 @@ ReaderBase::ReaderBase( handler_ = DecryptionHandler::create(*footer_, decryptorFactory_.get()); } -std::vector ReaderBase::getRowsPerStripe() const { +std::vector ReaderBase::rowsPerStripe() const { std::vector rowsPerStripe; - auto numStripes = getFooter().stripesSize(); + auto numStripes = footer().stripesSize(); rowsPerStripe.reserve(numStripes); for (auto i = 0; i < numStripes; i++) { - rowsPerStripe.push_back(getFooter().stripes(i).numberOfRows()); + rowsPerStripe.push_back(footer().stripes(i).numberOfRows()); } return rowsPerStripe; } -std::unique_ptr ReaderBase::getStatistics() const { - StatsContext statsContext(getWriterName(), getWriterVersion()); +std::unique_ptr ReaderBase::statistics() const { + StatsContext statsContext(writerName(), writerVersion()); return std::make_unique(*this, statsContext); } -std::unique_ptr ReaderBase::getColumnStatistics( +std::unique_ptr ReaderBase::columnStatistics( uint32_t index) const { - DWIO_ENSURE_LT( + VELOX_CHECK_LT( index, static_cast(footer_->statisticsSize()), "column index out of range"); - StatsContext statsContext(getWriterVersion()); + StatsContext statsContext(writerVersion()); if (!handler_->isEncrypted(index)) { auto stats = footer_->statistics(index); return buildColumnStatisticsFromProto(stats, statsContext); @@ -333,25 +333,25 @@ std::shared_ptr ReaderBase::convertType( convertType( footer, type.subtypes(1), fileColumnNamesReadAsLowerCase)); case TypeKind::ROW: { - std::vector> tl; - tl.reserve(type.subtypesSize()); + std::vector> types; + types.reserve(type.subtypesSize()); std::vector names; names.reserve(type.subtypesSize()); for (int32_t i = 0; i < type.subtypesSize(); ++i) { - auto child = convertType( + auto childType = convertType( footer, type.subtypes(i), fileColumnNamesReadAsLowerCase); auto childName = type.fieldNames(i); if (fileColumnNamesReadAsLowerCase) { folly::toLowerAscii(childName); } names.push_back(std::move(childName)); - tl.push_back(std::move(child)); + types.push_back(std::move(childType)); } // NOTE: There are empty dwrf files in data warehouse that has empty // struct as the root type. So the assumption that struct has at least one // child doesn't hold. - return ROW(std::move(names), std::move(tl)); + return ROW(std::move(names), std::move(types)); } default: DWIO_RAISE("Unknown type kind"); diff --git a/velox/dwio/dwrf/reader/ReaderBase.h b/velox/dwio/dwrf/reader/ReaderBase.h index e7f8c0bd1a7f..34287d3fcd71 100644 --- a/velox/dwio/dwrf/reader/ReaderBase.h +++ b/velox/dwio/dwrf/reader/ReaderBase.h @@ -109,19 +109,19 @@ class ReaderBase { virtual ~ReaderBase() = default; - memory::MemoryPool& getMemoryPool() const { + memory::MemoryPool& memoryPool() const { return pool_; } - const PostScript& getPostScript() const { + const PostScript& postScript() const { return *postScript_; } - const FooterWrapper& getFooter() const { + const FooterWrapper& footer() const { return *footer_; } - const RowTypePtr& getSchema() const { + const RowTypePtr& schema() const { return schema_; } @@ -129,8 +129,7 @@ class ReaderBase { schema_ = std::move(newSchema); } - const std::shared_ptr& getSchemaWithId() - const { + const std::shared_ptr& schemaWithId() const { if (!schemaWithId_) { if (scanSpec_) { schemaWithId_ = dwio::common::TypeWithId::create(schema_, *scanSpec_); @@ -141,45 +140,45 @@ class ReaderBase { return schemaWithId_; } - dwio::common::BufferedInput& getBufferedInput() const { + dwio::common::BufferedInput& bufferedInput() const { return *input_; } - const std::unique_ptr& getMetadataCache() const { + const std::unique_ptr& metadataCache() const { return cache_; } - const encryption::DecryptionHandler& getDecryptionHandler() const { + const encryption::DecryptionHandler& decryptionHandler() const { return *handler_; } - uint64_t getFooterEstimatedSize() const { + uint64_t footerEstimatedSize() const { return footerEstimatedSize_; } - uint64_t getFileLength() const { + uint64_t fileLength() const { return fileLength_; } - std::vector getRowsPerStripe() const; + std::vector rowsPerStripe() const; - uint64_t getPostScriptLength() const { + uint64_t postScriptLength() const { return psLength_; } - uint64_t getCompressionBlockSize() const { + uint64_t compressionBlockSize() const { return postScript_->hasCompressionBlockSize() ? postScript_->compressionBlockSize() : common::DEFAULT_COMPRESSION_BLOCK_SIZE; } - common::CompressionKind getCompressionKind() const { + common::CompressionKind compressionKind() const { return postScript_->hasCompressionBlockSize() ? postScript_->compression() : common::CompressionKind::CompressionKind_NONE; } - WriterVersion getWriterVersion() const { + WriterVersion writerVersion() const { if (!postScript_->hasWriterVersion()) { return WriterVersion::ORIGINAL; } @@ -189,7 +188,7 @@ class ReaderBase { : WriterVersion::FUTURE; } - const std::string& getWriterName() const { + const std::string& writerName() const { for (int32_t index = 0; index < footer_->metadataSize(); ++index) { auto entry = footer_->metadata(index); if (entry.name() == WRITER_NAME_KEY) { @@ -201,9 +200,9 @@ class ReaderBase { return kEmpty; } - std::unique_ptr getStatistics() const; + std::unique_ptr statistics() const; - std::unique_ptr getColumnStatistics( + std::unique_ptr columnStatistics( uint32_t index) const; std::unique_ptr createDecompressedStream( @@ -211,9 +210,9 @@ class ReaderBase { const std::string& streamDebugInfo, const dwio::common::encryption::Decrypter* decrypter = nullptr) const { return createDecompressor( - getCompressionKind(), + compressionKind(), std::move(compressed), - getCompressionBlockSize(), + compressionBlockSize(), pool_, streamDebugInfo, decrypter); diff --git a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h index 0d32b412de9a..c2eefd0edccb 100644 --- a/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveByteRleColumnReader.h @@ -36,7 +36,8 @@ class SelectiveByteRleColumnReader std::move(fileType), params, scanSpec) { - EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; + const EncodingKey encodingKey{ + fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); if (isBool) { boolRle_ = createBooleanRleDecoder( @@ -77,14 +78,16 @@ class SelectiveByteRleColumnReader return numValues; } - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override { + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override { readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } template - void readWithVisitor(RowSet rows, ColumnVisitor visitor); + void readWithVisitor(const RowSet& rows, ColumnVisitor visitor); std::unique_ptr byteRle_; std::unique_ptr boolRle_; @@ -92,9 +95,8 @@ class SelectiveByteRleColumnReader template void SelectiveByteRleColumnReader::readWithVisitor( - RowSet rows, + const RowSet& rows, ColumnVisitor visitor) { - vector_size_t numRows = rows.back() + 1; if (boolRle_) { if (nullsInReadRange_) { boolRle_->readWithVisitor( diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp index a334924af79b..32b73b4e9387 100644 --- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.cpp @@ -43,7 +43,7 @@ SelectiveDecimalColumnReader::SelectiveDecimalColumnReader( scaleDecoder_ = createRleDecoder( stripe.getStream(secondary, params.streamLabels().label(), true), version_, - memoryPool_, + *memoryPool_, stripe.getUseVInts(secondary), LONG_BYTE_SIZE); } @@ -86,7 +86,7 @@ void SelectiveDecimalColumnReader::readHelper(RowSet rows) { } // copy scales into scaleBuffer_ - ensureCapacity(scaleBuffer_, numValues_, &memoryPool_); + ensureCapacity(scaleBuffer_, numValues_, memoryPool_); scaleBuffer_->setSize(numValues_ * sizeof(int64_t)); memcpy( scaleBuffer_->asMutable(), @@ -109,7 +109,7 @@ void SelectiveDecimalColumnReader::readHelper(RowSet rows) { template void SelectiveDecimalColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { VELOX_CHECK(!scanSpec_->filter()); VELOX_CHECK(!scanSpec_->valueHook()); @@ -124,7 +124,7 @@ void SelectiveDecimalColumnReader::read( template void SelectiveDecimalColumnReader::getValues( - RowSet rows, + const RowSet& rows, VectorPtr* result) { auto nullsPtr = resultNulls() ? resultNulls()->template as() : nullptr; diff --git a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h index cf0d328d4721..f27bfe350ef8 100644 --- a/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveDecimalColumnReader.h @@ -36,9 +36,10 @@ class SelectiveDecimalColumnReader : public SelectiveColumnReader { void seekToRowGroup(uint32_t index) override; uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* nulls) override; + void read(vector_size_t offset, const RowSet& rows, const uint64_t* nulls) + override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; private: template diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp index 45e5e104e991..c2f05507545a 100644 --- a/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.cpp @@ -39,9 +39,12 @@ std::unique_ptr buildIntegerReader( DwrfParams& params, uint32_t numBytes, common::ScanSpec& scanSpec) { - EncodingKey ek{fileType->id(), params.flatMapContext().sequence}; + const EncodingKey encodingKey{ + fileType->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); - switch (static_cast(stripe.getEncoding(ek).kind())) { + const auto encodingKind = + static_cast(stripe.getEncoding(encodingKey).kind()); + switch (encodingKind) { case proto::ColumnEncoding_Kind_DICTIONARY: case proto::ColumnEncoding_Kind_DICTIONARY_V2: return std::make_unique( @@ -51,7 +54,7 @@ std::unique_ptr buildIntegerReader( return std::make_unique( requestedType, fileType, params, numBytes, scanSpec); default: - DWIO_RAISE("buildReader unhandled integer encoding"); + VELOX_FAIL("buildReader unhandled integer encoding: {}", encodingKind); } } @@ -62,9 +65,10 @@ std::unique_ptr SelectiveDwrfReader::build( DwrfParams& params, common::ScanSpec& scanSpec, bool isRoot) { - DWIO_ENSURE( + VELOX_CHECK( !isRoot || fileType->type()->kind() == TypeKind::ROW, "The root object can only be a row."); + dwio::common::typeutils::checkTypeCompatibility( *fileType->type(), *requestedType); EncodingKey ek{fileType->id(), params.flatMapContext().sequence}; @@ -142,7 +146,7 @@ std::unique_ptr SelectiveDwrfReader::build( } [[fallthrough]]; default: - DWIO_RAISE( + VELOX_FAIL( "buildReader unhandled type: " + mapTypeKindToName(fileType->type()->kind())); } diff --git a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h index e5e660ea0533..2f6bc6834c45 100644 --- a/velox/dwio/dwrf/reader/SelectiveDwrfReader.h +++ b/velox/dwio/dwrf/reader/SelectiveDwrfReader.h @@ -22,7 +22,7 @@ namespace facebook::velox::dwrf { -// Wrapper for static functions for making DWRF readers +/// Wrapper for static functions for making DWRF readers class SelectiveDwrfReader { public: static std::unique_ptr build( @@ -32,8 +32,8 @@ class SelectiveDwrfReader { common::ScanSpec& scanSpec, bool isRoot = false); - // Compatibility wrapper for tests. Takes the components of DwrfParams as - // separate. + /// Compatibility wrapper for tests. Takes the components of DwrfParams as + /// separate. static std::unique_ptr build( const TypePtr& requestedType, const std::shared_ptr& fileType, diff --git a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp index e5c666bb4042..35fea227cf1b 100644 --- a/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveFlatMapColumnReader.cpp @@ -210,12 +210,14 @@ class SelectiveFlatMapReader : public SelectiveStructColumnReaderBase { *this, getKeyNodes(requestedType, fileType, params, scanSpec, false)) {} - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override { + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override { flatMap_.read(offset, rows, incomingNulls); } - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { flatMap_.getValues(rows, result); } diff --git a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h index 6cce3ad3ec75..4f4b78f2c768 100644 --- a/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveFloatingPointColumnReader.h @@ -46,15 +46,17 @@ class SelectiveFloatingPointColumnReader uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override { + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override { using T = SelectiveFloatingPointColumnReader; this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } template - void readWithVisitor(RowSet rows, TVisitor visitor); + void readWithVisitor(const RowSet& rows, TVisitor visitor); FloatingPointDecoder decoder_; }; @@ -88,7 +90,7 @@ uint64_t SelectiveFloatingPointColumnReader::skip( template template void SelectiveFloatingPointColumnReader::readWithVisitor( - RowSet rows, + const RowSet& /*rows*/, TVisitor visitor) { if (this->nullsInReadRange_) { decoder_.template readWithVisitor( diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp index 8161ae5f180f..c740e7683253 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.cpp @@ -32,21 +32,22 @@ SelectiveIntegerDictionaryColumnReader::SelectiveIntegerDictionaryColumnReader( params, scanSpec, std::move(fileType)) { - EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; + const EncodingKey encodingKey{ + fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); - auto encoding = stripe.getEncoding(encodingKey); + const auto encoding = stripe.getEncoding(encodingKey); scanState_.dictionary.numValues = encoding.dictionarysize(); rleVersion_ = convertRleVersion(encoding.kind()); - auto data = encodingKey.forKind(proto::Stream_Kind_DATA); - bool dataVInts = stripe.getUseVInts(data); - dataReader_ = createRleDecoder( - stripe.getStream(data, params.streamLabels().label(), true), + const auto si = encodingKey.forKind(proto::Stream_Kind_DATA); + const bool dataVInts = stripe.getUseVInts(si); + dataReader_ = createRleDecoder( + stripe.getStream(si, params.streamLabels().label(), true), rleVersion_, - memoryPool_, + *memoryPool_, dataVInts, numBytes); - // make a lazy dictionary initializer + // Makes a lazy dictionary initializer dictInit_ = stripe.getIntDictionaryInitializerForNode( encodingKey, numBytes, params.streamLabels(), numBytes); @@ -72,7 +73,7 @@ uint64_t SelectiveIntegerDictionaryColumnReader::skip(uint64_t numValues) { void SelectiveIntegerDictionaryColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { VELOX_WIDTH_DISPATCH( sizeOfIntKind(fileType_->type()->kind()), @@ -80,19 +81,19 @@ void SelectiveIntegerDictionaryColumnReader::read( offset, rows, incomingNulls); - auto end = rows.back() + 1; + const auto end = rows.back() + 1; const auto* rawNulls = nullsInReadRange_ ? nullsInReadRange_->as() : nullptr; - // read the stream of booleans indicating whether a given data entry - // is an offset or a literal value. + // Read the stream of booleans indicating whether a given data entry is an + // offset or a literal value. if (inDictionaryReader_) { - bool isBulk = useBulkPath(); - int32_t numFlags = (isBulk && nullsInReadRange_) + const bool isBulk = useBulkPath(); + const int32_t numFlags = (isBulk && nullsInReadRange_) ? bits::countNonNulls(nullsInReadRange_->as(), 0, end) : end; dwio::common::ensureCapacity( - scanState_.inDictionary, bits::nwords(numFlags), &memoryPool_); + scanState_.inDictionary, bits::nwords(numFlags), memoryPool_); // The in dict buffer may have changed. If no change in // dictionary, the raw state will not be updated elsewhere. scanState_.rawState.inDictionary = scanState_.inDictionary->as(); diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h index e87a73a6dffc..9683bbe8a86e 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDictionaryColumnReader.h @@ -42,24 +42,24 @@ class SelectiveIntegerDictionaryColumnReader } dataReader_->seekToRowGroup(positionsProvider); - VELOX_CHECK(!positionsProvider.hasNext()); } uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; template - void readWithVisitor(RowSet rows, ColumnVisitor visitor); + void readWithVisitor(const RowSet& rows, ColumnVisitor visitor); private: void ensureInitialized(); std::unique_ptr inDictionaryReader_; - std::unique_ptr> dataReader_; - std::unique_ptr> dictReader_; + std::unique_ptr> dataReader_; std::function dictInit_; RleVersion rleVersion_; bool initialized_{false}; @@ -67,7 +67,7 @@ class SelectiveIntegerDictionaryColumnReader template void SelectiveIntegerDictionaryColumnReader::readWithVisitor( - RowSet rows, + const RowSet& /*rows*/, ColumnVisitor visitor) { auto dictVisitor = visitor.toDictionaryColumnVisitor(); if (rleVersion_ == RleVersion_1) { diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp index c7bd41bda3be..a434dfa5cd19 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.cpp @@ -20,13 +20,13 @@ namespace facebook::velox::dwrf { uint64_t SelectiveIntegerDirectColumnReader::skip(uint64_t numValues) { numValues = SelectiveColumnReader::skip(numValues); - ints->skip(numValues); + intDecoder_->skip(numValues); return numValues; } void SelectiveIntegerDirectColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { VELOX_WIDTH_DISPATCH( dwio::common::sizeOfIntKind(fileType_->type()->kind()), diff --git a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h index a123ae2e8652..297ccd9afdbc 100644 --- a/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveIntegerDirectColumnReader.h @@ -38,27 +38,28 @@ class SelectiveIntegerDirectColumnReader params, scanSpec, std::move(fileType)) { - EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; - auto data = encodingKey.forKind(proto::Stream_Kind_DATA); - auto& stripe = params.stripeStreams(); - bool dataVInts = stripe.getUseVInts(data); + const EncodingKey encodingKey{ + fileType_->id(), params.flatMapContext().sequence}; + const auto si = encodingKey.forKind(proto::Stream_Kind_DATA); + const auto& stripe = params.stripeStreams(); + const bool dataVInts = stripe.getUseVInts(si); - format = stripe.format(); - if (format == velox::dwrf::DwrfFormat::kDwrf) { - ints = createDirectDecoder( - stripe.getStream(data, params.streamLabels().label(), true), + format_ = stripe.format(); + if (format_ == velox::dwrf::DwrfFormat::kDwrf) { + intDecoder_ = createDirectDecoder( + stripe.getStream(si, params.streamLabels().label(), true), dataVInts, numBytes); - } else if (format == velox::dwrf::DwrfFormat::kOrc) { - version = convertRleVersion(stripe.getEncoding(encodingKey).kind()); - ints = createRleDecoder( - stripe.getStream(data, params.streamLabels().label(), true), - version, + } else if (format_ == velox::dwrf::DwrfFormat::kOrc) { + version_ = convertRleVersion(stripe.getEncoding(encodingKey).kind()); + intDecoder_ = createRleDecoder( + stripe.getStream(si, params.streamLabels().label(), true), + version_, params.pool(), dataVInts, numBytes); } else { - VELOX_FAIL("invalid stripe format"); + VELOX_FAIL("invalid stripe format: {}", format_); } } @@ -69,31 +70,34 @@ class SelectiveIntegerDirectColumnReader void seekToRowGroup(uint32_t index) override { dwio::common::SelectiveIntegerColumnReader::seekToRowGroup(index); auto positionsProvider = formatData_->seekToRowGroup(index); - ints->seekToRowGroup(positionsProvider); + intDecoder_->seekToRowGroup(positionsProvider); VELOX_CHECK(!positionsProvider.hasNext()); } uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; template - void readWithVisitor(RowSet rows, ColumnVisitor visitor); + void readWithVisitor(const RowSet& rows, ColumnVisitor visitor); private: - dwrf::DwrfFormat format; - RleVersion version; - std::unique_ptr> ints; + dwrf::DwrfFormat format_; + RleVersion version_; + std::unique_ptr> intDecoder_; }; template void SelectiveIntegerDirectColumnReader::readWithVisitor( - RowSet rows, + const RowSet& rows, ColumnVisitor visitor) { - if (format == velox::dwrf::DwrfFormat::kDwrf) { - decodeWithVisitor>(ints.get(), visitor); + if (format_ == velox::dwrf::DwrfFormat::kDwrf) { + decodeWithVisitor>( + intDecoder_.get(), visitor); } else { // orc format does not use int128 if constexpr (!std::is_same_v) { @@ -107,13 +111,13 @@ void SelectiveIntegerDirectColumnReader::readWithVisitor( &visitor.reader(), rows, visitor.extractValues()); - if (version == velox::dwrf::RleVersion_1) { + if (version_ == velox::dwrf::RleVersion_1) { decodeWithVisitor>( - ints.get(), drVisitor); + intDecoder_.get(), drVisitor); } else { - VELOX_CHECK(version == velox::dwrf::RleVersion_2); + VELOX_CHECK(version_ == velox::dwrf::RleVersion_2); decodeWithVisitor>( - ints.get(), drVisitor); + intDecoder_.get(), drVisitor); } } else { VELOX_UNREACHABLE( diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp index a8a92a6f5758..88171321583a 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.cpp @@ -25,10 +25,11 @@ std::unique_ptr> makeLengthDecoder( memory::MemoryPool& pool) { EncodingKey encodingKey{fileType.id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); - auto rleVersion = convertRleVersion(stripe.getEncoding(encodingKey).kind()); - auto lenId = encodingKey.forKind(proto::Stream_Kind_LENGTH); - bool lenVints = stripe.getUseVInts(lenId); - return createRleDecoder( + const auto rleVersion = + convertRleVersion(stripe.getEncoding(encodingKey).kind()); + const auto lenId = encodingKey.forKind(proto::Stream_Kind_LENGTH); + const bool lenVints = stripe.getUseVInts(lenId); + return createRleDecoder( stripe.getStream(lenId, params.streamLabels().label(), true), rleVersion, pool, @@ -37,7 +38,7 @@ std::unique_ptr> makeLengthDecoder( } } // namespace -FlatMapContext flatMapContextFromEncodingKey(EncodingKey& encodingKey) { +FlatMapContext flatMapContextFromEncodingKey(const EncodingKey& encodingKey) { return FlatMapContext{ .sequence = encodingKey.sequence(), .inMapDecoder = nullptr, @@ -54,8 +55,8 @@ SelectiveListColumnReader::SelectiveListColumnReader( fileType, params, scanSpec), - length_(makeLengthDecoder(*fileType_, params, memoryPool_)) { - DWIO_ENSURE_EQ(fileType_->id(), fileType->id(), "working on the same node"); + length_(makeLengthDecoder(*fileType_, params, *memoryPool_)) { + VELOX_CHECK_EQ(fileType_->id(), fileType->id(), "working on the same node"); EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); // count the number of selected sub-columns @@ -86,9 +87,10 @@ SelectiveMapColumnReader::SelectiveMapColumnReader( fileType, params, scanSpec), - length_(makeLengthDecoder(*fileType_, params, memoryPool_)) { - DWIO_ENSURE_EQ(fileType_->id(), fileType->id(), "working on the same node"); - EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; + length_(makeLengthDecoder(*fileType_, params, *memoryPool_)) { + VELOX_CHECK_EQ(fileType_->id(), fileType->id(), "working on the same node"); + const EncodingKey encodingKey{ + fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); if (scanSpec_->children().empty()) { scanSpec_->getOrCreateChild(common::ScanSpec::kMapKeysFieldName); diff --git a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h index 7f31c425ce4b..d777dd71d011 100644 --- a/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveRepeatedColumnReader.h @@ -92,7 +92,7 @@ class SelectiveMapColumnReader : public dwio::common::SelectiveMapColumnReader { } private: - std::unique_ptr> length_; + std::unique_ptr> length_; }; } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp index 1cf7d1771117..ecbf2f47f880 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.cpp @@ -41,7 +41,7 @@ SelectiveStringDictionaryColumnReader::SelectiveStringDictionaryColumnReader( dictIndex_ = createRleDecoder( stripe.getStream(dataId, params.streamLabels().label(), true), version_, - memoryPool_, + *memoryPool_, dictVInts, dwio::common::INT_BYTE_SIZE); @@ -50,7 +50,7 @@ SelectiveStringDictionaryColumnReader::SelectiveStringDictionaryColumnReader( lengthDecoder_ = createRleDecoder( stripe.getStream(lenId, params.streamLabels().label(), false), version_, - memoryPool_, + *memoryPool_, lenVInts, dwio::common::INT_BYTE_SIZE); @@ -73,7 +73,7 @@ SelectiveStringDictionaryColumnReader::SelectiveStringDictionaryColumnReader( encodingKey.forKind(proto::Stream_Kind_STRIDE_DICTIONARY), params.streamLabels().label(), true); - DWIO_ENSURE_NOT_NULL(strideDictStream_, "Stride dictionary is missing"); + VELOX_CHECK_NOT_NULL(strideDictStream_, "Stride dictionary is missing"); const auto strideDictLenId = encodingKey.forKind(proto::Stream_Kind_STRIDE_DICTIONARY_LENGTH); @@ -81,7 +81,7 @@ SelectiveStringDictionaryColumnReader::SelectiveStringDictionaryColumnReader( strideDictLengthDecoder_ = createRleDecoder( stripe.getStream(strideDictLenId, params.streamLabels().label(), true), version_, - memoryPool_, + *memoryPool_, strideLenVInt, dwio::common::INT_BYTE_SIZE); } @@ -103,7 +103,7 @@ void SelectiveStringDictionaryColumnReader::loadDictionary( DictionaryValues& values) { // read lengths from length reader dwio::common::ensureCapacity( - values.values, values.numValues, &memoryPool_); + values.values, values.numValues, memoryPool_); // The lengths are read in the low addresses of the string views array. auto* lengths = values.values->asMutable(); lengthDecoder.nextLengths(lengths, values.numValues); @@ -112,7 +112,7 @@ void SelectiveStringDictionaryColumnReader::loadDictionary( stringsBytes += lengths[i]; } // read bytes from underlying string - values.strings = AlignedBuffer::allocate(stringsBytes, &memoryPool_); + values.strings = AlignedBuffer::allocate(stringsBytes, memoryPool_); data.readFully(values.strings->asMutable(), stringsBytes); // fill the values with StringViews over the strings. 'strings' will // exist even if 'stringsBytes' is 0, which can happen if the only @@ -167,7 +167,7 @@ void SelectiveStringDictionaryColumnReader::makeDictionaryBaseVector() { if (scanState_.dictionary2.numValues) { BufferPtr values = AlignedBuffer::allocate( scanState_.dictionary.numValues + scanState_.dictionary2.numValues, - &memoryPool_); + memoryPool_); auto* valuesPtr = values->asMutable(); memcpy( valuesPtr, @@ -179,7 +179,7 @@ void SelectiveStringDictionaryColumnReader::makeDictionaryBaseVector() { scanState_.dictionary2.numValues * sizeof(StringView)); dictionaryValues_ = std::make_shared>( - &memoryPool_, + memoryPool_, fileType_->type(), BufferPtr(nullptr), // TODO nulls scanState_.dictionary.numValues + @@ -189,7 +189,7 @@ void SelectiveStringDictionaryColumnReader::makeDictionaryBaseVector() { scanState_.dictionary.strings, scanState_.dictionary2.strings}); } else { dictionaryValues_ = std::make_shared>( - &memoryPool_, + memoryPool_, fileType_->type(), BufferPtr(nullptr), // TODO nulls scanState_.dictionary.numValues /*length*/, @@ -200,7 +200,7 @@ void SelectiveStringDictionaryColumnReader::makeDictionaryBaseVector() { void SelectiveStringDictionaryColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); bool isDense = rows.back() == rows.size() - 1; @@ -216,7 +216,7 @@ void SelectiveStringDictionaryColumnReader::read( ? bits::countNonNulls(nullsInReadRange_->as(), 0, end) : end; dwio::common::ensureCapacity( - scanState_.inDictionary, bits::nwords(numFlags), &memoryPool_); + scanState_.inDictionary, bits::nwords(numFlags), memoryPool_); // The in dict buffer may have changed. If no change in // dictionary, the raw state will not be updated elsewhere. scanState_.rawState.inDictionary = scanState_.inDictionary->as(); @@ -266,7 +266,7 @@ void SelectiveStringDictionaryColumnReader::read( void SelectiveStringDictionaryColumnReader::makeFlat(VectorPtr* result) { auto* indices = reinterpret_cast(rawValues_); - auto values = AlignedBuffer::allocate(numValues_, &memoryPool_); + auto values = AlignedBuffer::allocate(numValues_, memoryPool_); auto* stringViews = values->asMutable(); std::vector stringBuffers; auto* stripeDict = scanState_.dictionary.values->as(); @@ -291,7 +291,7 @@ void SelectiveStringDictionaryColumnReader::makeFlat(VectorPtr* result) { } } *result = std::make_shared>( - &memoryPool_, + memoryPool_, requestedType(), std::move(nulls), numValues_, @@ -301,7 +301,7 @@ void SelectiveStringDictionaryColumnReader::makeFlat(VectorPtr* result) { } void SelectiveStringDictionaryColumnReader::getValues( - RowSet rows, + const RowSet& rows, VectorPtr* result) { compactScalarValues(rows, false); VELOX_CHECK_GT(numRowsScanned_, 0); @@ -321,7 +321,7 @@ void SelectiveStringDictionaryColumnReader::getValues( makeDictionaryBaseVector(); } *result = std::make_shared>( - &memoryPool_, resultNulls(), numValues_, dictionaryValues_, values_); + memoryPool_, resultNulls(), numValues_, dictionaryValues_, values_); } void SelectiveStringDictionaryColumnReader::ensureInitialized() { diff --git a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h index 640cf02937f0..3b2dddd88bfd 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDictionaryColumnReader.h @@ -53,26 +53,31 @@ class SelectiveStringDictionaryColumnReader uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; private: void loadStrideDictionary(); void makeDictionaryBaseVector(); template - void readWithVisitor(RowSet rows, TVisitor visitor); + void readWithVisitor(const RowSet& rows, TVisitor visitor); template - void readHelper(common::Filter* filter, RowSet rows, ExtractValues values); + void readHelper( + common::Filter* filter, + const RowSet& rows, + const ExtractValues& values); template void processFilter( common::Filter* filter, - RowSet rows, - ExtractValues extractValues); + const RowSet& rows, + const ExtractValues& extractValues); // Fills 'values' from 'data' and 'lengthDecoder'. The count of // values is in 'values.numValues'. @@ -109,7 +114,7 @@ class SelectiveStringDictionaryColumnReader template void SelectiveStringDictionaryColumnReader::readWithVisitor( - RowSet rows, + const RowSet& /*rows*/, TVisitor visitor) { if (version_ == velox::dwrf::RleVersion_1) { decodeWithVisitor>( @@ -123,8 +128,8 @@ void SelectiveStringDictionaryColumnReader::readWithVisitor( template void SelectiveStringDictionaryColumnReader::readHelper( common::Filter* filter, - RowSet rows, - ExtractValues values) { + const RowSet& rows, + const ExtractValues& values) { readWithVisitor( rows, dwio::common:: @@ -135,8 +140,8 @@ void SelectiveStringDictionaryColumnReader::readHelper( template void SelectiveStringDictionaryColumnReader::processFilter( common::Filter* filter, - RowSet rows, - ExtractValues extractValues) { + const RowSet& rows, + const ExtractValues& extractValues) { if (filter == nullptr) { readHelper( &dwio::common::alwaysTrue(), rows, extractValues); diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp index 573ebbfeda5c..8c471d92bba8 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.cpp @@ -36,7 +36,7 @@ SelectiveStringDirectColumnReader::SelectiveStringDirectColumnReader( lengthDecoder_ = createRleDecoder( stripe.getStream(lenId, params.streamLabels().label(), true), rleVersion, - memoryPool_, + *memoryPool_, lenVInts, dwio::common::INT_BYTE_SIZE); blobStream_ = stripe.getStream( @@ -47,7 +47,7 @@ SelectiveStringDirectColumnReader::SelectiveStringDirectColumnReader( uint64_t SelectiveStringDirectColumnReader::skip(uint64_t numValues) { numValues = SelectiveColumnReader::skip(numValues); - dwio::common::ensureCapacity(lengths_, numValues, &memoryPool_); + dwio::common::ensureCapacity(lengths_, numValues, memoryPool_); lengthDecoder_->nextLengths(lengths_->asMutable(), numValues); rawLengths_ = lengths_->as(); for (auto i = 0; i < numValues; ++i) { @@ -465,7 +465,7 @@ void SelectiveStringDirectColumnReader::readWithVisitor( void SelectiveStringDirectColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); auto numRows = rows.back() + 1; @@ -473,7 +473,7 @@ void SelectiveStringDirectColumnReader::read( ? BaseVector::countNulls(nullsInReadRange_, 0, numRows) : 0; dwio::common::ensureCapacity( - lengths_, numRows - numNulls, &memoryPool_); + lengths_, numRows - numNulls, memoryPool_); lengthDecoder_->nextLengths( lengths_->asMutable(), numRows - numNulls); rawLengths_ = lengths_->as(); diff --git a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h index cfa8a7350136..e8bd978ba523 100644 --- a/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStringDirectColumnReader.h @@ -44,10 +44,12 @@ class SelectiveStringDirectColumnReader uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { rawStringBuffer_ = nullptr; rawStringSize_ = 0; rawStringUsed_ = 0; diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp index 992f6f5b8ac7..ef957e1316e5 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.cpp @@ -37,25 +37,26 @@ SelectiveStructColumnReader::SelectiveStructColumnReader( isRoot) { EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); - auto encoding = static_cast(stripe.getEncoding(encodingKey).kind()); - DWIO_ENSURE_EQ( - encoding, - proto::ColumnEncoding_Kind_DIRECT, + const auto encodingKind = + static_cast(stripe.getEncoding(encodingKey).kind()); + VELOX_CHECK( + encodingKind == proto::ColumnEncoding_Kind_DIRECT, "Unknown encoding for StructColumnReader"); // A reader tree may be constructed while the ScanSpec is being used // for another read. This happens when the next stripe is being // prepared while the previous one is reading. auto& childSpecs = scanSpec.stableChildren(); - auto& rowType = requestedType_->asRow(); + const auto& rowType = requestedType_->asRow(); for (auto i = 0; i < childSpecs.size(); ++i) { - auto childSpec = childSpecs[i]; + auto* childSpec = childSpecs[i]; if (isChildConstant(*childSpec)) { childSpec->setSubscript(kConstantChildSpecSubscript); continue; } - auto childFileType = fileType_->childByName(childSpec->fieldName()); - auto childRequestedType = rowType.findChild(childSpec->fieldName()); + + const auto childFileType = fileType_->childByName(childSpec->fieldName()); + const auto childRequestedType = rowType.findChild(childSpec->fieldName()); auto labels = params.streamLabels().append(folly::to(i)); auto childParams = DwrfParams( stripe, diff --git a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h index 08d146cd86b2..4d55fd74bf29 100644 --- a/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveStructColumnReader.h @@ -48,11 +48,11 @@ class SelectiveStructColumnReaderBase readOffset_ = index * rowsPerRowGroup_; return; } + // There may be a nulls stream but no other streams for the struct. formatData_->seekToRowGroup(index); - // Set the read offset recursively. Do this before seeking the - // children because list/map children will reset the offsets for - // their children. + // Set the read offset recursively. Do this before seeking the children + // because list/map children will reset the offsets for their children. setReadOffsetRecursive(index * rowsPerRowGroup_); for (auto& child : children_) { child->seekToRowGroup(index); @@ -66,8 +66,8 @@ class SelectiveStructColumnReaderBase if (!reader->isTopLevel()) { return; } - auto rowGroup = reader->readOffset() / rowsPerRowGroup_; - auto nextRowGroup = offset / rowsPerRowGroup_; + const auto rowGroup = reader->readOffset() / rowsPerRowGroup_; + const auto nextRowGroup = offset / rowsPerRowGroup_; if (nextRowGroup > rowGroup) { reader->seekToRowGroup(nextRowGroup); reader->setReadOffset(nextRowGroup * rowsPerRowGroup_); @@ -78,7 +78,8 @@ class SelectiveStructColumnReaderBase const int32_t rowsPerRowGroup_; }; -struct SelectiveStructColumnReader : SelectiveStructColumnReaderBase { +class SelectiveStructColumnReader : public SelectiveStructColumnReaderBase { + public: SelectiveStructColumnReader( const TypePtr& requestedType, const std::shared_ptr& fileType, diff --git a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp index 427fa6242950..689584d706f1 100644 --- a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp +++ b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.cpp @@ -28,24 +28,24 @@ SelectiveTimestampColumnReader::SelectiveTimestampColumnReader( common::ScanSpec& scanSpec) : SelectiveColumnReader(fileType->type(), fileType, params, scanSpec), precision_( - params.stripeStreams().getRowReaderOptions().timestampPrecision()) { + params.stripeStreams().rowReaderOptions().timestampPrecision()) { EncodingKey encodingKey{fileType_->id(), params.flatMapContext().sequence}; auto& stripe = params.stripeStreams(); version_ = convertRleVersion(stripe.getEncoding(encodingKey).kind()); auto data = encodingKey.forKind(proto::Stream_Kind_DATA); bool vints = stripe.getUseVInts(data); - seconds_ = createRleDecoder( + seconds_ = createRleDecoder( stripe.getStream(data, params.streamLabels().label(), true), version_, - memoryPool_, + *memoryPool_, vints, LONG_BYTE_SIZE); auto nanoData = encodingKey.forKind(proto::Stream_Kind_NANO_DATA); bool nanoVInts = stripe.getUseVInts(nanoData); - nano_ = createRleDecoder( + nano_ = createRleDecoder( stripe.getStream(nanoData, params.streamLabels().label(), true), version_, - memoryPool_, + *memoryPool_, nanoVInts, LONG_BYTE_SIZE); } @@ -68,7 +68,7 @@ void SelectiveTimestampColumnReader::seekToRowGroup(uint32_t index) { void SelectiveTimestampColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); VELOX_CHECK( @@ -78,7 +78,7 @@ void SelectiveTimestampColumnReader::read( resultNulls_->capacity() * 8 < rows.size()) { // Make sure a dedicated resultNulls_ is allocated with enough capacity as // RleDecoder always assumes it is available. - resultNulls_ = AlignedBuffer::allocate(rows.size(), &memoryPool_); + resultNulls_ = AlignedBuffer::allocate(rows.size(), memoryPool_); rawResultNulls_ = resultNulls_->asMutable(); } bool isDense = rows.back() == rows.size() - 1; @@ -93,7 +93,7 @@ void SelectiveTimestampColumnReader::read( template void SelectiveTimestampColumnReader::readHelper( common::Filter* filter, - RowSet rows) { + const RowSet& rows) { ExtractToReader extractValues(this); common::AlwaysTrue alwaysTrue; DirectRleColumnVisitor< @@ -112,7 +112,7 @@ void SelectiveTimestampColumnReader::readHelper( // Save the seconds into their own buffer before reading nanos into // 'values_' dwio::common::ensureCapacity( - secondsValues_, numValues_, &memoryPool_); + secondsValues_, numValues_, memoryPool_); secondsValues_->setSize(numValues_ * sizeof(int64_t)); memcpy( secondsValues_->asMutable(), @@ -133,7 +133,7 @@ void SelectiveTimestampColumnReader::readHelper( const auto rawNulls = nullsInReadRange_ ? (isDense ? nullsInReadRange_->as() : rawResultNulls_) : nullptr; - auto tsValues = AlignedBuffer::allocate(numValues_, &memoryPool_); + auto tsValues = AlignedBuffer::allocate(numValues_, memoryPool_); auto rawTs = tsValues->asMutable(); for (vector_size_t i = 0; i < numValues_; i++) { @@ -196,7 +196,7 @@ void SelectiveTimestampColumnReader::readHelper( void SelectiveTimestampColumnReader::processNulls( const bool isNull, - const RowSet rows, + const RowSet& rows, const uint64_t* rawNulls) { if (!rawNulls) { return; @@ -227,7 +227,7 @@ void SelectiveTimestampColumnReader::processNulls( void SelectiveTimestampColumnReader::processFilter( const common::Filter* filter, - const RowSet rows, + const RowSet& rows, const uint64_t* rawNulls) { auto rawTs = values_->asMutable(); @@ -257,7 +257,9 @@ void SelectiveTimestampColumnReader::processFilter( } } -void SelectiveTimestampColumnReader::getValues(RowSet rows, VectorPtr* result) { +void SelectiveTimestampColumnReader::getValues( + const RowSet& rows, + VectorPtr* result) { getFlatValues(rows, result, fileType_->type(), true); } diff --git a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h index ab40a3c24c5e..b3f1b8e9c845 100644 --- a/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h +++ b/velox/dwio/dwrf/reader/SelectiveTimestampColumnReader.h @@ -35,20 +35,22 @@ class SelectiveTimestampColumnReader void seekToRowGroup(uint32_t index) override; uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; private: template - void readHelper(common::Filter* filter, RowSet rows); + void readHelper(common::Filter* filter, const RowSet& rows); void - processNulls(const bool isNull, const RowSet rows, const uint64_t* rawNulls); + processNulls(const bool isNull, const RowSet& rows, const uint64_t* rawNulls); void processFilter( const common::Filter* filter, - const RowSet rows, + const RowSet& rows, const uint64_t* rawNulls); const TimestampPrecision precision_; diff --git a/velox/dwio/dwrf/reader/StreamLabels.cpp b/velox/dwio/dwrf/reader/StreamLabels.cpp index ac6e97fcffab..cdfe65317c17 100644 --- a/velox/dwio/dwrf/reader/StreamLabels.cpp +++ b/velox/dwio/dwrf/reader/StreamLabels.cpp @@ -24,13 +24,13 @@ std::string_view StreamLabels::label() const { StreamLabels StreamLabels::append(std::string_view suffix) const { const size_t bufSize = label_.size() + suffix.size() + 1; - auto buffer = pool_.allocateFixed(bufSize); + auto* buffer = pool_.allocateFixed(bufSize); if (!label_.empty()) { - memcpy(buffer, label_.data(), label_.size()); + ::memcpy(buffer, label_.data(), label_.size()); } buffer[label_.size()] = '/'; if (!suffix.empty()) { - memcpy(buffer + label_.size() + 1, suffix.data(), suffix.size()); + ::memcpy(buffer + label_.size() + 1, suffix.data(), suffix.size()); } return StreamLabels(pool_, {buffer, bufSize}); } diff --git a/velox/dwio/dwrf/reader/StripeDictionaryCache.cpp b/velox/dwio/dwrf/reader/StripeDictionaryCache.cpp index 9f82278d8c50..16e0f4c9f5ef 100644 --- a/velox/dwio/dwrf/reader/StripeDictionaryCache.cpp +++ b/velox/dwio/dwrf/reader/StripeDictionaryCache.cpp @@ -35,14 +35,15 @@ StripeDictionaryCache::StripeDictionaryCache(velox::memory::MemoryPool* pool) // It might be more elegant to pass in a StripeStream here instead. void StripeDictionaryCache::registerIntDictionary( - const EncodingKey& ek, + const EncodingKey& encodingKey, folly::Function&& dictGen) { intDictionaryFactories_.emplace( - ek, std::make_unique(std::move(dictGen))); + encodingKey, std::make_unique(std::move(dictGen))); } -BufferPtr StripeDictionaryCache::getIntDictionary(const EncodingKey& ek) { - return intDictionaryFactories_.at(ek)->getDictionaryBuffer(pool_); +BufferPtr StripeDictionaryCache::getIntDictionary( + const EncodingKey& encodingKey) { + return intDictionaryFactories_.at(encodingKey)->getDictionaryBuffer(pool_); } } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/StripeDictionaryCache.h b/velox/dwio/dwrf/reader/StripeDictionaryCache.h index 096b09128390..df5f834c12a2 100644 --- a/velox/dwio/dwrf/reader/StripeDictionaryCache.h +++ b/velox/dwio/dwrf/reader/StripeDictionaryCache.h @@ -26,9 +26,18 @@ namespace facebook::velox::dwrf { class StripeDictionaryCache { - // This could be potentially made an interface to be shared for - // string dictionaries. However, we will need a union return type - // in that case. + public: + explicit StripeDictionaryCache(velox::memory::MemoryPool* pool); + + void registerIntDictionary( + const EncodingKey& encodingKey, + folly::Function&& dictGen); + + BufferPtr getIntDictionary(const EncodingKey& encodingKey); + + private: + // This could be potentially made an interface to be shared for string + // dictionaries. However, we will need a union return type in that case. class DictionaryEntry { public: explicit DictionaryEntry( @@ -42,18 +51,8 @@ class StripeDictionaryCache { folly::once_flag onceFlag_; }; - public: - explicit StripeDictionaryCache(velox::memory::MemoryPool* pool); - - void registerIntDictionary( - const EncodingKey& ek, - folly::Function&& dictGen); - - BufferPtr getIntDictionary(const EncodingKey& ek); - - private: // This is typically the reader's memory pool. - memory::MemoryPool* pool_; + memory::MemoryPool* const pool_; std::unordered_map< EncodingKey, std::unique_ptr, diff --git a/velox/dwio/dwrf/reader/StripeMetadataCache.h b/velox/dwio/dwrf/reader/StripeMetadataCache.h index 828ced773da5..eaa9d671f986 100644 --- a/velox/dwio/dwrf/reader/StripeMetadataCache.h +++ b/velox/dwio/dwrf/reader/StripeMetadataCache.h @@ -80,7 +80,7 @@ class StripeMetadataCache { private: uint64_t getIndex(StripeCacheMode mode, uint64_t stripeIndex) const { if (mode_ & mode) { - uint64_t index = + const uint64_t index = (mode_ == mode ? stripeIndex : stripeIndex * 2 + mode - StripeCacheMode::INDEX); // offsets has N + 1 items, so length[N] = offset[N+1]- offset[N] diff --git a/velox/dwio/dwrf/reader/StripeReaderBase.cpp b/velox/dwio/dwrf/reader/StripeReaderBase.cpp index f7c8c9727c25..611318d46886 100644 --- a/velox/dwio/dwrf/reader/StripeReaderBase.cpp +++ b/velox/dwio/dwrf/reader/StripeReaderBase.cpp @@ -26,25 +26,25 @@ using dwio::common::LogType; std::unique_ptr StripeReaderBase::fetchStripe( uint32_t index, bool& preload) const { - auto& fileFooter = reader_->getFooter(); + auto& fileFooter = reader_->footer(); VELOX_CHECK_LT(index, fileFooter.stripesSize(), "invalid stripe index"); auto stripe = fileFooter.stripes(index); - auto& cache = reader_->getMetadataCache(); + auto& cache = reader_->metadataCache(); uint64_t offset = stripe.offset(); uint64_t length = stripe.indexLength() + stripe.dataLength() + stripe.footerLength(); - std::unique_ptr prefetchedStripe; - if (reader_->getBufferedInput().isBuffered(offset, length)) { + std::unique_ptr stripeInput; + if (reader_->bufferedInput().isBuffered(offset, length)) { preload = true; - prefetchedStripe = nullptr; + stripeInput = nullptr; } else { - prefetchedStripe = reader_->getBufferedInput().clone(); + stripeInput = reader_->bufferedInput().clone(); if (preload) { // If metadata cache exists, adjust read position to avoid re-reading // metadata sections - if (cache) { + if (cache != nullptr) { if (cache->has(StripeCacheMode::INDEX, index)) { offset += stripe.indexLength(); length -= stripe.indexLength(); @@ -54,68 +54,68 @@ std::unique_ptr StripeReaderBase::fetchStripe( } } - prefetchedStripe->enqueue({offset, length, "stripe"}); - prefetchedStripe->load(LogType::STRIPE); + stripeInput->enqueue({offset, length, "stripe"}); + stripeInput->load(LogType::STRIPE); } } // load stripe footer - std::unique_ptr stream; + std::unique_ptr footerStream; if (cache) { - stream = cache->get(StripeCacheMode::FOOTER, index); + footerStream = cache->get(StripeCacheMode::FOOTER, index); } - if (!stream) { - dwio::common::BufferedInput& bi = - prefetchedStripe ? *prefetchedStripe : reader_->getBufferedInput(); - stream = bi.read( + if (!footerStream) { + dwio::common::BufferedInput& bufferInput = + (stripeInput != nullptr) ? *stripeInput : reader_->bufferedInput(); + footerStream = bufferInput.read( stripe.offset() + stripe.indexLength() + stripe.dataLength(), stripe.footerLength(), LogType::STRIPE_FOOTER); } - auto streamDebugInfo = fmt::format("Stripe {} Footer ", index); + const auto streamDebugInfo = fmt::format("Stripe {} Footer ", index); auto arena = std::make_shared(); auto* rawFooter = google::protobuf::Arena::CreateMessage(arena.get()); ProtoUtils::readProtoInto( - reader_->createDecompressedStream(std::move(stream), streamDebugInfo), + reader_->createDecompressedStream( + std::move(footerStream), streamDebugInfo), rawFooter); std::shared_ptr stripeFooter( rawFooter, [arena = std::move(arena)](auto*) { arena->Reset(); }); - auto handler = std::make_unique( - reader_->getDecryptionHandler()); + auto decryptionHandler = std::make_unique( + reader_->decryptionHandler()); // refresh stripe encryption key if necessary - loadEncryptionKeys(index, *stripeFooter, *handler, stripe); - - return prefetchedStripe == nullptr ? std::make_unique( - &reader_->getBufferedInput(), - std::move(stripeFooter), - std::move(handler), - std::move(stripe)) - : std::make_unique( - std::move(prefetchedStripe), - std::move(stripeFooter), - std::move(handler), - std::move(stripe)); + loadEncryptionKeys(index, *stripeFooter, stripe, *decryptionHandler); + return stripeInput == nullptr ? std::make_unique( + &reader_->bufferedInput(), + std::move(stripeFooter), + std::move(decryptionHandler), + std::move(stripe)) + : std::make_unique( + std::move(stripeInput), + std::move(stripeFooter), + std::move(decryptionHandler), + std::move(stripe)); } void StripeReaderBase::loadEncryptionKeys( uint32_t index, const proto::StripeFooter& stripeFooter, - encryption::DecryptionHandler& handler, - const StripeInformationWrapper& stripeInfo) const { + const StripeInformationWrapper& stripeInfo, + encryption::DecryptionHandler& handler) const { if (!handler.isEncrypted()) { return; } - DWIO_ENSURE_EQ( + VELOX_CHECK_EQ( stripeFooter.encryptiongroups_size(), handler.getEncryptionGroupCount()); - auto& fileFooter = reader_->getFooter(); - DWIO_ENSURE_LT(index, fileFooter.stripesSize(), "invalid stripe index"); + const auto& fileFooter = reader_->footer(); + VELOX_CHECK_LT(index, fileFooter.stripesSize(), "invalid stripe index"); // If current stripe has keys, load these keys. if (stripeInfo.keyMetadataSize() > 0) { @@ -131,12 +131,13 @@ void StripeReaderBase::loadEncryptionKeys( // // But, since we are enabling parallel loads now, let's not rely on // sequential order. Let's apply (2). - DWIO_ENSURE_GT(index, 0, "first stripe must have key"); + VELOX_CHECK_GT(index, 0, "first stripe must have key"); + uint32_t prevIndex = index - 1; while (true) { - auto prev = fileFooter.stripes(prevIndex); - if (prev.keyMetadataSize() > 0) { - handler.setKeys(prev.keyMetadata()); + const auto prevStripeInfo = fileFooter.stripes(prevIndex); + if (prevStripeInfo.keyMetadataSize() > 0) { + handler.setKeys(prevStripeInfo.keyMetadata()); break; } --prevIndex; diff --git a/velox/dwio/dwrf/reader/StripeReaderBase.h b/velox/dwio/dwrf/reader/StripeReaderBase.h index 7f7ee43bc6ec..6ca5d6fd7319 100644 --- a/velox/dwio/dwrf/reader/StripeReaderBase.h +++ b/velox/dwio/dwrf/reader/StripeReaderBase.h @@ -26,32 +26,32 @@ namespace facebook::velox::dwrf { struct StripeMetadata { dwio::common::BufferedInput* stripeInput; std::shared_ptr footer; - std::unique_ptr handler; + std::unique_ptr decryptionHandler; StripeInformationWrapper stripeInfo; StripeMetadata( - std::shared_ptr stripeInput, - std::shared_ptr footer, - std::unique_ptr handler, - StripeInformationWrapper stripeInfo) - : stripeInput{stripeInput.get()}, - footer{std::move(footer)}, - handler{std::move(handler)}, - stripeInfo{std::move(stripeInfo)}, - stripeInputOwned{std::move(stripeInput)} {} + std::shared_ptr _stripeInput, + std::shared_ptr _footer, + std::unique_ptr _decryptionHandler, + StripeInformationWrapper _stripeInfo) + : stripeInput{_stripeInput.get()}, + footer{std::move(_footer)}, + decryptionHandler{std::move(_decryptionHandler)}, + stripeInfo{std::move(_stripeInfo)}, + stripeInputOwned_{std::move(_stripeInput)} {} StripeMetadata( - dwio::common::BufferedInput* stripeInput, - std::shared_ptr footer, - std::unique_ptr handler, - StripeInformationWrapper stripeInfo) - : stripeInput{stripeInput}, - footer{std::move(footer)}, - handler{std::move(handler)}, - stripeInfo{std::move(stripeInfo)} {} + dwio::common::BufferedInput* _stripeInput, + std::shared_ptr _footer, + std::unique_ptr _decryptionHandler, + StripeInformationWrapper _stripeInfo) + : stripeInput{_stripeInput}, + footer{std::move(_footer)}, + decryptionHandler{std::move(_decryptionHandler)}, + stripeInfo{std::move(_stripeInfo)} {} private: - std::shared_ptr stripeInputOwned; + const std::shared_ptr stripeInputOwned_; }; class StripeReaderBase { @@ -74,14 +74,14 @@ class StripeReaderBase { bool& preload) const; private: - const std::shared_ptr reader_; - // stripeFooter default null arg should only be used for testing. void loadEncryptionKeys( uint32_t index, const proto::StripeFooter& stripeFooter, - encryption::DecryptionHandler& handler, - const StripeInformationWrapper& stripeInfo) const; + const StripeInformationWrapper& stripeInfo, + encryption::DecryptionHandler& handler) const; + + const std::shared_ptr reader_; friend class StripeLoadKeysTest; }; diff --git a/velox/dwio/dwrf/reader/StripeStream.cpp b/velox/dwio/dwrf/reader/StripeStream.cpp index a29529292e19..0ad5d7fc5125 100644 --- a/velox/dwio/dwrf/reader/StripeStream.cpp +++ b/velox/dwio/dwrf/reader/StripeStream.cpp @@ -104,48 +104,49 @@ BufferPtr readDict( std::function StripeStreamsBase::getIntDictionaryInitializerForNode( - const EncodingKey& ek, + const EncodingKey& encodingKey, uint64_t elementWidth, const StreamLabels& streamLabels, uint64_t dictionaryWidth) { // Create local copy for manipulation - EncodingKey localEk{ek}; - auto dictData = localEk.forKind(proto::Stream_Kind_DICTIONARY_DATA); - auto dataStream = getStream(dictData, streamLabels.label(), false); - auto dictionarySize = getEncoding(localEk).dictionarysize(); + EncodingKey dictEncodingKey{encodingKey}; + auto dictDataSi = dictEncodingKey.forKind(proto::Stream_Kind_DICTIONARY_DATA); + auto dictDataStream = getStream(dictDataSi, streamLabels.label(), false); + const auto dictionarySize = getEncoding(dictEncodingKey).dictionarysize(); // Try fetching shared dictionary streams instead. - if (!dataStream) { + if (!dictDataStream) { // Get the label of the top level column, since this dictionary is shared by - // the entire column + // the entire column. auto label = streamLabels.label(); // Shouldn't be empty, but just in case if (!label.empty()) { // Ex: "/5/1759392083" -> "/5" label = label.substr(0, label.find('/', 1)); } - localEk = EncodingKey(ek.node(), 0); - dictData = localEk.forKind(proto::Stream_Kind_DICTIONARY_DATA); - dataStream = getStream(dictData, label, false); + dictEncodingKey = EncodingKey(encodingKey.node(), 0); + dictDataSi = dictEncodingKey.forKind(proto::Stream_Kind_DICTIONARY_DATA); + dictDataStream = getStream(dictDataSi, label, false); } - bool dictVInts = getUseVInts(dictData); - DWIO_ENSURE(dataStream.get()); + + const bool dictVInts = getUseVInts(dictDataSi); + VELOX_CHECK_NOT_NULL(dictDataStream); stripeDictionaryCache_->registerIntDictionary( - localEk, + dictEncodingKey, [dictReader = createDirectDecoder( - std::move(dataStream), dictVInts, elementWidth), + std::move(dictDataStream), dictVInts, elementWidth), dictionaryWidth, dictionarySize](velox::memory::MemoryPool* pool) mutable { return VELOX_WIDTH_DISPATCH( dictionaryWidth, readDict, dictReader.get(), dictionarySize, pool); }); - return [&dictCache = *stripeDictionaryCache_, localEk]() { + return [&dictCache = *stripeDictionaryCache_, dictEncodingKey]() { // If this is not flat map or if dictionary is not shared, return as is - return dictCache.getIntDictionary(localEk); + return dictCache.getIntDictionary(dictEncodingKey); }; } void StripeStreamsImpl::loadStreams() { - auto& stripeFooter = *readState_->stripeMetadata->footer; + const auto& stripeFooter = *readState_->stripeMetadata->footer; if (selector_) { // HACK!!! @@ -158,14 +159,14 @@ void StripeStreamsImpl::loadStreams() { VELOX_CHECK_NULL(projectedNodes_); projectedNodes_ = std::make_shared(0); auto expected = selector_->getSchemaWithId(); - auto actual = readState_->readerBase->getSchemaWithId(); + auto actual = readState_->readerBase->schemaWithId(); findProjectedNodes( *projectedNodes_, *expected, *actual, [&](uint32_t node) { return selector_->shouldReadNode(node); }); } - auto addStream = [&](auto& stream, auto& offset) { + const auto addStream = [&](auto& stream, auto& offset) { if (stream.has_offset()) { offset = stream.offset(); } @@ -175,25 +176,26 @@ void StripeStreamsImpl::loadStreams() { offset += stream.length(); }; - uint64_t streamOffset = 0; + uint64_t streamOffset{0}; for (auto& stream : stripeFooter.streams()) { addStream(stream, streamOffset); } // update column encoding for each stream for (uint32_t i = 0; i < stripeFooter.encoding_size(); ++i) { - auto& e = stripeFooter.encoding(i); - auto node = e.has_node() ? e.node() : i; + const auto& e = stripeFooter.encoding(i); + const auto node = e.has_node() ? e.node() : i; if (projectedNodes_->contains(node)) { encodings_[{node, e.has_sequence() ? e.sequence() : 0}] = i; } } // handle encrypted columns - auto& handler = *readState_->stripeMetadata->handler; - if (handler.isEncrypted()) { - DWIO_ENSURE_EQ( - handler.getEncryptionGroupCount(), + const auto& decryptionHandler = + *readState_->stripeMetadata->decryptionHandler; + if (decryptionHandler.isEncrypted()) { + VELOX_CHECK_EQ( + decryptionHandler.getEncryptionGroupCount(), stripeFooter.encryptiongroups_size()); folly::F14FastSet groupIndices; bits::forEachSetBit( @@ -201,26 +203,28 @@ void StripeStreamsImpl::loadStreams() { 0, projectedNodes_->max() + 1, [&](uint32_t node) { - if (handler.isEncrypted(node)) { - groupIndices.insert(handler.getEncryptionGroupIndex(node)); + if (decryptionHandler.isEncrypted(node)) { + groupIndices.insert( + decryptionHandler.getEncryptionGroupIndex(node)); } }); // decrypt encryption groups for (auto index : groupIndices) { - auto& group = stripeFooter.encryptiongroups(index); - auto groupProto = + const auto& group = stripeFooter.encryptiongroups(index); + const auto groupProto = readState_->readerBase ->readProtoFromString( group, - std::addressof(handler.getEncryptionProviderByIndex(index))); + std::addressof( + decryptionHandler.getEncryptionProviderByIndex(index))); streamOffset = 0; for (auto& stream : groupProto->streams()) { addStream(stream, streamOffset); } for (auto& encoding : groupProto->encoding()) { - DWIO_ENSURE(encoding.has_node(), "node is required"); - auto node = encoding.node(); + VELOX_CHECK(encoding.has_node(), "node is required"); + const auto node = encoding.node(); if (projectedNodes_->contains(node)) { decryptedEncodings_[{ node, encoding.has_sequence() ? encoding.sequence() : 0}] = @@ -237,23 +241,23 @@ StripeStreamsImpl::getCompressedStream( std::string_view label) const { const auto& info = getStreamInfo(si); - std::unique_ptr streamRead; + std::unique_ptr streamInput; if (isIndexStream(si.kind())) { - streamRead = getIndexStreamFromCache(info); + streamInput = getIndexStreamFromCache(info); } - if (!streamRead) { - streamRead = readState_->stripeMetadata->stripeInput->enqueue( + if (!streamInput) { + streamInput = readState_->stripeMetadata->stripeInput->enqueue( {info.getOffset() + stripeStart_, info.getLength(), label}, &si); } - DWIO_ENSURE(streamRead != nullptr, " Stream can't be read", si.toString()); - return streamRead; + VELOX_CHECK_NOT_NULL(streamInput, " Stream can't be read", si.toString()); + return streamInput; } folly::F14FastMap> StripeStreamsImpl::getEncodingKeys() const { - DWIO_ENSURE_EQ( + VELOX_CHECK_EQ( decryptedEncodings_.size(), 0, "Not supported for reader with encryption"); @@ -272,41 +276,41 @@ StripeStreamsImpl::getStreamIdentifiers() const { folly::F14FastMap> nodeToStreamIdMap; - for (const auto& kv : streams_) { - nodeToStreamIdMap[kv.first.encodingKey().node()].push_back(kv.first); + for (const auto& item : streams_) { + nodeToStreamIdMap[item.first.encodingKey().node()].push_back(item.first); } - return nodeToStreamIdMap; } std::unique_ptr StripeStreamsImpl::getStream( const DwrfStreamIdentifier& si, std::string_view label, - bool /* throwIfNotFound*/) const { - // if not found, return an empty {} - const auto& info = getStreamInfo(si, false /* throwIfNotFound */); - if (!info.valid()) { // Stream not found. + bool /*throwIfNotFound*/) const { + // If not found, return an empty {} + const auto& info = getStreamInfo(si, /*throwIfNotFound=*/false); + if (!info.valid()) { + // Stream not found. return {}; } - std::unique_ptr streamRead; + std::unique_ptr streamInput; if (isIndexStream(si.kind())) { - streamRead = getIndexStreamFromCache(info); + streamInput = getIndexStreamFromCache(info); } - if (!streamRead) { - streamRead = readState_->stripeMetadata->stripeInput->enqueue( + if (!streamInput) { + streamInput = readState_->stripeMetadata->stripeInput->enqueue( {info.getOffset() + stripeStart_, info.getLength(), label}, &si); } - if (!streamRead) { - return streamRead; + if (!streamInput) { + return streamInput; } - auto streamDebugInfo = + const auto streamDebugInfo = fmt::format("Stripe {} Stream {}", stripeIndex_, si.toString()); return readState_->readerBase->createDecompressedStream( - std::move(streamRead), + std::move(streamInput), streamDebugInfo, getDecrypter(si.encodingKey().node())); } @@ -321,7 +325,6 @@ uint32_t StripeStreamsImpl::visitStreamsOfNode( ++count; } } - return count; } @@ -337,40 +340,43 @@ bool StripeStreamsImpl::getUseVInts(const DwrfStreamIdentifier& si) const { std::unique_ptr StripeStreamsImpl::getIndexStreamFromCache( const StreamInformation& info) const { - std::unique_ptr indexStream; - auto& metadataCache = readState_->readerBase->getMetadataCache(); - if (metadataCache) { - auto indexBase = metadataCache->get(StripeCacheMode::INDEX, stripeIndex_); - if (indexBase) { - auto offset = info.getOffset(); - auto length = info.getLength(); - if (auto* cacheInput = - dynamic_cast(indexBase.get())) { - cacheInput->Skip(offset); - cacheInput->setRemainingBytes(length); - return indexBase; - } - const void* start; - { - int32_t ignored; - const bool ret = indexBase->Next(&start, &ignored); - VELOX_CHECK(ret, "Failed to read index"); - } - indexStream = std::make_unique( - static_cast(start) + offset, length); - } + auto& metadataCache = readState_->readerBase->metadataCache(); + if (!metadataCache) { + return nullptr; + } + + auto indexBase = metadataCache->get(StripeCacheMode::INDEX, stripeIndex_); + if (!indexBase) { + return nullptr; + } + + const auto offset = info.getOffset(); + const auto length = info.getLength(); + if (auto* cacheInput = + dynamic_cast(indexBase.get())) { + cacheInput->Skip(offset); + cacheInput->setRemainingBytes(length); + return indexBase; + } + + const void* start; + { + int32_t ignored; + const bool ret = indexBase->Next(&start, &ignored); + VELOX_CHECK(ret, "Failed to read index"); } - return indexStream; + return std::make_unique( + static_cast(start) + offset, length); } void StripeStreamsImpl::loadReadPlan() { - DWIO_ENSURE_EQ(readPlanLoaded_, false, "only load read plan once!"); + VELOX_CHECK(!readPlanLoaded_, "only load read plan once!"); SCOPE_EXIT { readPlanLoaded_ = true; }; - auto& input = *readState_->stripeMetadata->stripeInput; - input.load(LogType::STREAM_BUNDLE); + auto* input = readState_->stripeMetadata->stripeInput; + input->load(LogType::STREAM_BUNDLE); } } // namespace facebook::velox::dwrf diff --git a/velox/dwio/dwrf/reader/StripeStream.h b/velox/dwio/dwrf/reader/StripeStream.h index 5e0c93d20196..362a0b43d097 100644 --- a/velox/dwio/dwrf/reader/StripeStream.h +++ b/velox/dwio/dwrf/reader/StripeStream.h @@ -37,12 +37,6 @@ class StrideIndexProvider { * StreamInformation Implementation */ class StreamInformationImpl : public StreamInformation { - private: - DwrfStreamIdentifier streamId_; - uint64_t offset_; - uint64_t length_; - bool useVInts_; - public: static const StreamInformationImpl& getNotFound() { static const StreamInformationImpl NOT_FOUND; @@ -54,9 +48,7 @@ class StreamInformationImpl : public StreamInformation { : streamId_(stream), offset_(offset), length_(stream.length()), - useVInts_(stream.usevints()) { - // PASS - } + useVInts_(stream.usevints()) {} ~StreamInformationImpl() override = default; @@ -87,6 +79,12 @@ class StreamInformationImpl : public StreamInformation { bool valid() const override { return streamId_.encodingKey().valid(); } + + private: + DwrfStreamIdentifier streamId_; + uint64_t offset_; + uint64_t length_; + bool useVInts_; }; class StripeStreams { @@ -106,7 +104,7 @@ class StripeStreams { virtual const dwio::common::ColumnSelector& getColumnSelector() const = 0; // Get row reader options - virtual const dwio::common::RowReaderOptions& getRowReaderOptions() const = 0; + virtual const dwio::common::RowReaderOptions& rowReaderOptions() const = 0; /** * Get the encoding for the given column for this stripe. @@ -125,7 +123,7 @@ class StripeStreams { std::string_view label, bool throwIfNotFound) const = 0; - /// Get the integer dictionary data for the given node and sequence. + /// Gets the integer dictionary data for the given node and sequence. /// /// 'elementWidth' is the width of the data type of the column. /// 'dictionaryWidth' is *the width at which this is stored in the reader. @@ -199,8 +197,8 @@ class StripeStreamsBase : public StripeStreams { } protected: - memory::MemoryPool* pool_; - std::shared_ptr stripeDictionaryCache_; + memory::MemoryPool* const pool_; + const std::shared_ptr stripeDictionaryCache_; }; struct StripeReadState { @@ -208,10 +206,10 @@ struct StripeReadState { std::unique_ptr stripeMetadata; StripeReadState( - std::shared_ptr readerBase, - std::unique_ptr stripeMetadata) - : readerBase{std::move(readerBase)}, - stripeMetadata{std::move(stripeMetadata)} {} + std::shared_ptr _readerBase, + std::unique_ptr _stripeMetadata) + : readerBase{std::move(_readerBase)}, + stripeMetadata{std::move(_stripeMetadata)} {} }; /** @@ -230,7 +228,7 @@ class StripeStreamsImpl : public StripeStreamsBase { int64_t stripeNumberOfRows, const StrideIndexProvider& provider, uint32_t stripeIndex) - : StripeStreamsBase{&readState->readerBase->getMemoryPool()}, + : StripeStreamsBase{&readState->readerBase->memoryPool()}, readState_(std::move(readState)), selector_{selector}, opts_{opts}, @@ -238,8 +236,7 @@ class StripeStreamsImpl : public StripeStreamsBase { stripeStart_{stripeStart}, stripeNumberOfRows_{stripeNumberOfRows}, provider_(provider), - stripeIndex_{stripeIndex}, - readPlanLoaded_{false} { + stripeIndex_{stripeIndex} { loadStreams(); } @@ -253,22 +250,22 @@ class StripeStreamsImpl : public StripeStreamsBase { return *selector_; } - const dwio::common::RowReaderOptions& getRowReaderOptions() const override { + const dwio::common::RowReaderOptions& rowReaderOptions() const override { return opts_; } const proto::ColumnEncoding& getEncoding( - const EncodingKey& ek) const override { - auto index = encodings_.find(ek); + const EncodingKey& encodingKey) const override { + auto index = encodings_.find(encodingKey); if (index != encodings_.end()) { return readState_->stripeMetadata->footer->encoding(index->second); } - auto enc = decryptedEncodings_.find(ek); - DWIO_ENSURE( - enc != decryptedEncodings_.end(), + auto encodingKeyIt = decryptedEncodings_.find(encodingKey); + VELOX_CHECK( + encodingKeyIt != decryptedEncodings_.end(), "encoding not found: ", - ek.toString()); - return enc->second; + encodingKey.toString()); + return encodingKeyIt->second; } // load data into buffer according to read plan @@ -312,20 +309,20 @@ class StripeStreamsImpl : public StripeStreamsBase { } uint32_t rowsPerRowGroup() const override { - return readState_->readerBase->getFooter().rowIndexStride(); + return readState_->readerBase->footer().rowIndexStride(); } private: const StreamInformation& getStreamInfo( const DwrfStreamIdentifier& si, const bool throwIfNotFound = true) const { - auto index = streams_.find(si); - if (index == streams_.end()) { + const auto it = streams_.find(si); + if (it == streams_.end()) { VELOX_CHECK(!throwIfNotFound, "stream info not found: ", si.toString()); return StreamInformationImpl::getNotFound(); } - return index->second; + return it->second; } std::unique_ptr getIndexStreamFromCache( @@ -333,7 +330,7 @@ class StripeStreamsImpl : public StripeStreamsBase { const dwio::common::encryption::Decrypter* getDecrypter( uint32_t nodeId) const { - auto& handler = *readState_->stripeMetadata->handler; + auto& handler = *readState_->stripeMetadata->decryptionHandler; return handler.isEncrypted(nodeId) ? std::addressof(handler.getEncryptionProvider(nodeId)) : nullptr; @@ -352,7 +349,7 @@ class StripeStreamsImpl : public StripeStreamsBase { const StrideIndexProvider& provider_; const uint32_t stripeIndex_; - bool readPlanLoaded_; + bool readPlanLoaded_{false}; // map of stream id -> stream information folly::F14FastMap< diff --git a/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp b/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp index 46a82d36bc79..f0d66c6f699b 100644 --- a/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp +++ b/velox/dwio/dwrf/test/ColumnWriterStatsTests.cpp @@ -51,24 +51,24 @@ void verifyStats( const size_t repeat, const std::vector& nodeSizePerStride, const bool hasFlatMapCol) { - ASSERT_EQ(1, rowReader.getReader().getFooter().stripesSize()) + ASSERT_EQ(1, rowReader.getReader().footer().stripesSize()) << "Only one stripe expected"; - ASSERT_EQ(true, rowReader.getReader().getFooter().hasRawDataSize()) + ASSERT_EQ(true, rowReader.getReader().footer().hasRawDataSize()) << "File raw data size does not exist"; ASSERT_EQ( nodeSizePerStride.at(0) * repeat, - rowReader.getReader().getFooter().rawDataSize()) + rowReader.getReader().footer().rawDataSize()) << "File raw data size does not match"; // Verify File Column's raw Size. for (auto nodeId = 0; nodeId < nodeSizePerStride.size(); nodeId++) { ASSERT_EQ( nodeSizePerStride.at(nodeId) * repeat, - rowReader.getReader().getColumnStatistics(nodeId)->getRawSize()) + rowReader.getReader().columnStatistics(nodeId)->getRawSize()) << "RawSize does not match. Node " << nodeId << " " - << rowReader.getReader().getColumnStatistics(nodeId)->toString(); + << rowReader.getReader().columnStatistics(nodeId)->toString(); } bool preload = true; @@ -77,7 +77,7 @@ void verifyStats( // Verify Stripe content length + index length equals size of the column 0. auto totalStreamSize = stripeInfo.dataLength() + stripeInfo.indexLength(); - auto node_0_Size = rowReader.getReader().getColumnStatistics(0)->getSize(); + auto node_0_Size = rowReader.getReader().columnStatistics(0)->getSize(); ASSERT_EQ(node_0_Size, totalStreamSize) << "Total size does not match"; @@ -89,15 +89,15 @@ void verifyStats( } computeCumulativeNodeSize( - nodeSizes, *TypeWithId::create(rowReader.getReader().getSchema())); + nodeSizes, *TypeWithId::create(rowReader.getReader().schema())); for (auto nodeId = 0; - nodeId < rowReader.getReader().getFooter().statisticsSize(); + nodeId < rowReader.getReader().footer().statisticsSize(); nodeId++) { ASSERT_EQ( nodeSizes[nodeId], - rowReader.getReader().getColumnStatistics(nodeId)->getSize().value()) + rowReader.getReader().columnStatistics(nodeId)->getSize().value()) << "Size does not match. Node " << nodeId << " " - << rowReader.getReader().getColumnStatistics(nodeId)->toString(); + << rowReader.getReader().columnStatistics(nodeId)->toString(); } // Verify Stride Stats. @@ -106,7 +106,7 @@ void verifyStats( rowReader.readerBaseShared(), std::move(stripeMetadata)), &rowReader.getColumnSelector(), nullptr, - rowReader.getRowReaderOptions(), + rowReader.rowReaderOptions(), stripeInfo.offset(), static_cast(stripeInfo.numberOfRows()), rowReader, diff --git a/velox/dwio/dwrf/test/ColumnWriterTest.cpp b/velox/dwio/dwrf/test/ColumnWriterTest.cpp index 3d453c39c4d8..798d60461042 100644 --- a/velox/dwio/dwrf/test/ColumnWriterTest.cpp +++ b/velox/dwio/dwrf/test/ColumnWriterTest.cpp @@ -20,6 +20,7 @@ #include #include #include +#include "velox/common/base/tests/GTestUtils.h" #include "velox/common/memory/Memory.h" #include "velox/dwio/common/IntDecoder.h" #include "velox/dwio/common/TypeWithId.h" @@ -153,7 +154,7 @@ class TestStripeStreams : public StripeStreamsBase { return selector_; } - const RowReaderOptions& getRowReaderOptions() const override { + const RowReaderOptions& rowReaderOptions() const override { return options_; } @@ -358,9 +359,9 @@ void testDataTypeWriter( } // Reader API requires the caller to read the Stripe for number of // values and iterate only until that number. - // It does not support hasNext/next protocol. + // It does notd support hasNext/next protocol. // Use a bigger number like 50, as some values may be bit packed. - EXPECT_THROW({ reader->next(50, out); }, exception::LoggedException); + VELOX_ASSERT_THROW(reader->next(50, out), ""); context.nextStripe(); writer->reset(); diff --git a/velox/dwio/dwrf/test/E2EWriterTest.cpp b/velox/dwio/dwrf/test/E2EWriterTest.cpp index 03be102f5a9e..dd2cc39f01d3 100644 --- a/velox/dwio/dwrf/test/E2EWriterTest.cpp +++ b/velox/dwio/dwrf/test/E2EWriterTest.cpp @@ -1146,7 +1146,7 @@ class E2EEncryptionTest : public E2EWriterTest { const DecryptionHandler& getDecryptionHandler( const ::facebook::velox::dwrf::DwrfReader& reader) const { - return reader.testingReaderBase()->getDecryptionHandler(); + return reader.testingReaderBase()->decryptionHandler(); } void validateFileContent( diff --git a/velox/dwio/dwrf/test/OrcTest.h b/velox/dwio/dwrf/test/OrcTest.h index 3c21baf42343..942ad269242a 100644 --- a/velox/dwio/dwrf/test/OrcTest.h +++ b/velox/dwio/dwrf/test/OrcTest.h @@ -108,7 +108,7 @@ class MockStripeStreams : public StripeStreams { return *getColumnSelectorProxy(); } - const dwio::common::RowReaderOptions& getRowReaderOptions() const override { + const dwio::common::RowReaderOptions& rowReaderOptions() const override { auto ptr = getRowReaderOptionsProxy(); return ptr ? *ptr : options_; } diff --git a/velox/dwio/dwrf/test/ReaderBaseTests.cpp b/velox/dwio/dwrf/test/ReaderBaseTests.cpp index eb808d7577b8..17bb111780a6 100644 --- a/velox/dwio/dwrf/test/ReaderBaseTests.cpp +++ b/velox/dwio/dwrf/test/ReaderBaseTests.cpp @@ -127,8 +127,8 @@ class EncryptedStatsTest : public Test { std::shared_ptr readerPool_; }; -TEST_F(EncryptedStatsTest, getStatistics) { - auto stats = reader_->getStatistics(); +TEST_F(EncryptedStatsTest, statistics) { + auto stats = reader_->statistics(); for (size_t i = 1; i < 7; ++i) { auto& stat = stats->getColumnStatistics(i); if (i != 5) { @@ -141,7 +141,7 @@ TEST_F(EncryptedStatsTest, getStatistics) { TEST_F(EncryptedStatsTest, getStatisticsKeyNotLoaded) { clearKey(0); - auto stats = reader_->getStatistics(); + auto stats = reader_->statistics(); for (size_t i = 2; i < 7; ++i) { auto& stat = stats->getColumnStatistics(i); if (i != 5) { @@ -154,7 +154,7 @@ TEST_F(EncryptedStatsTest, getStatisticsKeyNotLoaded) { TEST_F(EncryptedStatsTest, getColumnStatistics) { for (size_t i = 1; i < 7; ++i) { - auto stats = reader_->getColumnStatistics(i); + auto stats = reader_->columnStatistics(i); if (i != 5) { ASSERT_EQ(stats->getNumberOfValues(), i * 100); } else { @@ -166,7 +166,7 @@ TEST_F(EncryptedStatsTest, getColumnStatistics) { TEST_F(EncryptedStatsTest, getColumnStatisticsKeyNotLoaded) { clearKey(0); for (size_t i = 2; i < 7; ++i) { - auto stats = reader_->getColumnStatistics(i); + auto stats = reader_->columnStatistics(i); if (i != 5) { ASSERT_EQ(stats->getNumberOfValues(), i * 100); } else { diff --git a/velox/dwio/dwrf/test/StripeReaderBaseTests.cpp b/velox/dwio/dwrf/test/StripeReaderBaseTests.cpp index d675564caa6c..1ed91ed929fa 100644 --- a/velox/dwio/dwrf/test/StripeReaderBaseTests.cpp +++ b/velox/dwio/dwrf/test/StripeReaderBaseTests.cpp @@ -87,10 +87,10 @@ class StripeLoadKeysTest : public Test { FooterWrapper(footer_.get()).stripes(index)); auto handler = std::make_unique( - reader_->getDecryptionHandler()); + reader_->decryptionHandler()); stripeReader_->loadEncryptionKeys( - index, *stripeFooter_, *handler, *stripeInfo_); + index, *stripeFooter_, *stripeInfo_, *handler); handler_ = std::move(handler); diff --git a/velox/dwio/dwrf/test/TestStripeStream.cpp b/velox/dwio/dwrf/test/TestStripeStream.cpp index e3181816702f..58dd3bab49ea 100644 --- a/velox/dwio/dwrf/test/TestStripeStream.cpp +++ b/velox/dwio/dwrf/test/TestStripeStream.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "velox/common/base/tests/GTestUtils.h" #include "velox/dwio/common/encryption/TestProvider.h" #include "velox/dwio/dwrf/reader/StripeStream.h" #include "velox/dwio/dwrf/test/OrcTest.h" @@ -66,7 +67,7 @@ void enqueueReads( const ColumnSelector& selector, uint64_t stripeStart, uint32_t stripeIndex) { - auto& metadataCache = readerBase.getMetadataCache(); + auto& metadataCache = readerBase.metadataCache(); uint64_t offset = stripeStart; uint64_t length = 0; for (const auto& stream : footer.streams()) { @@ -139,7 +140,7 @@ TEST_F(StripeStreamTest, planReads) { std::make_unique(proto::PostScript{}), footer, nullptr); - ColumnSelector cs{readerBase->getSchema(), std::vector{2}, true}; + ColumnSelector cs{readerBase->schema(), std::vector{2}, true}; auto stripeFooter = std::make_unique(); std::vector> ss{ std::make_tuple(1, StreamKind::StreamKind_ROW_INDEX, 100), @@ -157,7 +158,7 @@ TEST_F(StripeStreamTest, planReads) { TestDecrypterFactory factory; auto handler = DecryptionHandler::create(FooterWrapper(footer), &factory); auto stripeMetadata = std::make_unique( - readerBase->getBufferedInput().clone(), + readerBase->bufferedInput().clone(), std::move(stripeFooter), std::move(handler), StripeInformationWrapper( @@ -194,7 +195,7 @@ TEST_F(StripeStreamTest, filterSequences) { nullptr); // mock a filter that we only need one node and one sequence - ColumnSelector cs{readerBase->getSchema(), std::vector{"a#[1]"}}; + ColumnSelector cs{readerBase->schema(), std::vector{"a#[1]"}}; const auto& node = cs.getNode(1); auto seqFilter = std::make_shared>(); seqFilter->insert(1); @@ -225,7 +226,7 @@ TEST_F(StripeStreamTest, filterSequences) { // filter by sequence 1 std::vector expected{{600, 5000000}, {8000600, 1000000}}; auto stripeMetadata = std::make_unique( - readerBase->getBufferedInput().clone(), + readerBase->bufferedInput().clone(), std::move(stripeFooter), std::move(handler), StripeInformationWrapper( @@ -275,7 +276,7 @@ TEST_F(StripeStreamTest, zeroLength) { TestDecrypterFactory factory; auto handler = DecryptionHandler::create(FooterWrapper(footer), &factory); auto stripeMetadata = std::make_unique( - readerBase->getBufferedInput().clone(), + readerBase->bufferedInput().clone(), std::move(stripeFooter), std::move(handler), StripeInformationWrapper( @@ -368,7 +369,7 @@ TEST_F(StripeStreamTest, planReadsIndex) { TestDecrypterFactory factory; auto handler = DecryptionHandler::create(FooterWrapper(footer), &factory); auto stripeMetadata = std::make_unique( - readerBase->getBufferedInput().clone(), + readerBase->bufferedInput().clone(), std::move(stripeFooter), std::move(handler), StripeInformationWrapper( @@ -486,7 +487,7 @@ TEST_F(StripeStreamTest, readEncryptedStreams) { nullptr, std::move(handler)); auto stripeMetadata = std::make_unique( - &readerBase->getBufferedInput(), + &readerBase->bufferedInput(), std::move(stripeFooter), DecryptionHandler::create(FooterWrapper(footer), &factory), StripeInformationWrapper( @@ -494,7 +495,7 @@ TEST_F(StripeStreamTest, readEncryptedStreams) { auto stripeReadState = std::make_shared(readerBase, std::move(stripeMetadata)); StripeReaderBase stripeReader{readerBase}; - ColumnSelector selector{readerBase->getSchema(), {1, 2, 4}, true}; + ColumnSelector selector{readerBase->schema(), {1, 2, 4}, true}; TestProvider provider; StripeStreamsImpl streams{ stripeReadState, @@ -518,7 +519,7 @@ TEST_F(StripeStreamTest, readEncryptedStreams) { ASSERT_EQ(streams.getEncoding(ek).dictionarysize(), node + 1); ASSERT_NE(stream, nullptr); } else { - ASSERT_THROW(streams.getEncoding(ek), exception::LoggedException); + VELOX_ASSERT_THROW(streams.getEncoding(ek), "encoding not found"); ASSERT_EQ(stream, nullptr); } } @@ -569,7 +570,7 @@ TEST_F(StripeStreamTest, schemaMismatch) { nullptr, std::move(handler)); auto stripeMetadata = std::make_unique( - &readerBase->getBufferedInput(), + &readerBase->bufferedInput(), std::move(stripeFooter), DecryptionHandler::create(FooterWrapper(footer), &factory), StripeInformationWrapper( @@ -634,7 +635,7 @@ class TestStripeStreams : public StripeStreamsBase { VELOX_UNSUPPORTED(); } - const facebook::velox::dwio::common::RowReaderOptions& getRowReaderOptions() + const facebook::velox::dwio::common::RowReaderOptions& rowReaderOptions() const override { VELOX_UNSUPPORTED(); } diff --git a/velox/dwio/dwrf/test/WriterTest.cpp b/velox/dwio/dwrf/test/WriterTest.cpp index aef39623c221..3a8fee8e6d6b 100644 --- a/velox/dwio/dwrf/test/WriterTest.cpp +++ b/velox/dwio/dwrf/test/WriterTest.cpp @@ -186,10 +186,8 @@ TEST_P(AllWriterCompressionTest, compression) { // deserialize and verify auto reader = createReader(); - - auto& ps = reader->getPostScript(); ASSERT_EQ( - reader->getCompressionKind(), + reader->compressionKind(), // Verify the compression type is the same as passed-in. // If compression is not passed in (CompressionKind::CompressionKind_MAX), // verify compressionKind the compression type is the same as config. @@ -238,16 +236,16 @@ TEST_P(SupportedCompressionTest, WriteFooter) { // deserialize and verify auto reader = createReader(); - auto& ps = reader->getPostScript(); - ASSERT_EQ(reader->getWriterVersion(), config->get(Config::WRITER_VERSION)); - ASSERT_EQ(reader->getCompressionKind(), config->get(Config::COMPRESSION)); + auto& ps = reader->postScript(); + ASSERT_EQ(reader->writerVersion(), config->get(Config::WRITER_VERSION)); + ASSERT_EQ(reader->compressionKind(), config->get(Config::COMPRESSION)); ASSERT_EQ( - reader->getCompressionBlockSize(), + reader->compressionBlockSize(), config->get(Config::COMPRESSION_BLOCK_SIZE)); ASSERT_EQ(ps.cacheSize(), (10 + 10) * 3); ASSERT_EQ(ps.cacheMode(), config->get(Config::STRIPE_CACHE_MODE)); - auto& footer = reader->getFooter(); + auto& footer = reader->footer(); ASSERT_TRUE(footer.hasHeaderLength()); ASSERT_EQ(footer.headerLength(), ORC_MAGIC_LEN); ASSERT_TRUE(footer.hasContentLength()); @@ -265,8 +263,7 @@ TEST_P(SupportedCompressionTest, WriteFooter) { if (item.name() == WRITER_NAME_KEY) { ASSERT_EQ(item.value(), kDwioWriter); } else if (item.name() == WRITER_VERSION_KEY) { - ASSERT_EQ( - item.value(), folly::to(reader->getWriterVersion())); + ASSERT_EQ(item.value(), folly::to(reader->writerVersion())); } else if (item.name() == WRITER_HOSTNAME_KEY) { ASSERT_EQ(item.value(), process::getHostName()); } else { @@ -285,7 +282,7 @@ TEST_P(SupportedCompressionTest, WriteFooter) { footer.checksumAlgorithm(), config->get(Config::CHECKSUM_ALGORITHM)); ASSERT_THAT( footer.stripeCacheOffsets(), ElementsAre(0, 10, 20, 30, 40, 50, 60)); - auto& cache = reader->getMetadataCache(); + auto& cache = reader->metadataCache(); for (size_t i = 0; i < 3; ++i) { ASSERT_TRUE(cache->has(StripeCacheMode::INDEX, i)); ASSERT_TRUE(cache->has(StripeCacheMode::FOOTER, i)); @@ -341,7 +338,7 @@ TEST_P(SupportedCompressionTest, NoChecksum) { // deserialize and verify auto reader = createReader(); - auto& footer = reader->getFooter(); + auto& footer = reader->footer(); ASSERT_TRUE(footer.hasChecksumAlgorithm()); ASSERT_EQ(footer.checksumAlgorithm(), proto::ChecksumAlgorithm::NULL_); } @@ -376,12 +373,12 @@ TEST_P(SupportedCompressionTest, NoCache) { // deserialize and verify auto reader = createReader(); - auto& footer = reader->getFooter(); + auto& footer = reader->footer(); ASSERT_EQ(footer.stripeCacheOffsetsSize(), 0); - auto& ps = reader->getPostScript(); + auto& ps = reader->postScript(); ASSERT_EQ(ps.cacheMode(), StripeCacheMode::NA); ASSERT_EQ(ps.cacheSize(), 0); - ASSERT_EQ(reader->getMetadataCache(), nullptr); + ASSERT_EQ(reader->metadataCache(), nullptr); } TEST_P(SupportedCompressionTest, ValidateStreamSizeConfigDisabled) { diff --git a/velox/dwio/parquet/reader/BooleanColumnReader.h b/velox/dwio/parquet/reader/BooleanColumnReader.h index 465ec5adb6f9..1c2cd43a4706 100644 --- a/velox/dwio/parquet/reader/BooleanColumnReader.h +++ b/velox/dwio/parquet/reader/BooleanColumnReader.h @@ -47,14 +47,16 @@ class BooleanColumnReader : public dwio::common::SelectiveByteRleColumnReader { return numValues; } - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override { + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override { readCommon(offset, rows, incomingNulls); readOffset_ += rows.back() + 1; } template - void readWithVisitor(RowSet rows, ColumnVisitor visitor) { + void readWithVisitor(const RowSet& /*rows*/, ColumnVisitor visitor) { formatData_->as().readWithVisitor(visitor); } }; diff --git a/velox/dwio/parquet/reader/FloatingPointColumnReader.h b/velox/dwio/parquet/reader/FloatingPointColumnReader.h index ed91e67a739f..66487e1da938 100644 --- a/velox/dwio/parquet/reader/FloatingPointColumnReader.h +++ b/velox/dwio/parquet/reader/FloatingPointColumnReader.h @@ -45,15 +45,17 @@ class FloatingPointColumnReader uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override { + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override { using T = FloatingPointColumnReader; this->template readCommon(offset, rows, incomingNulls); this->readOffset_ += rows.back() + 1; } template - void readWithVisitor(RowSet rows, TVisitor visitor); + void readWithVisitor(const RowSet& rows, TVisitor visitor); }; template @@ -77,7 +79,7 @@ uint64_t FloatingPointColumnReader::skip( template template void FloatingPointColumnReader::readWithVisitor( - RowSet rows, + const RowSet& rows, TVisitor visitor) { this->formatData_->template as().readWithVisitor(visitor); } diff --git a/velox/dwio/parquet/reader/IntegerColumnReader.h b/velox/dwio/parquet/reader/IntegerColumnReader.h index 5c0689902a0a..4568e9ba2d44 100644 --- a/velox/dwio/parquet/reader/IntegerColumnReader.h +++ b/velox/dwio/parquet/reader/IntegerColumnReader.h @@ -53,7 +53,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { return numValues; } - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { auto& fileType = static_cast(*fileType_); auto logicalType = fileType.logicalType_; if (logicalType.has_value() && logicalType.value().__isset.INTEGER && @@ -66,7 +66,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { void read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* /*incomingNulls*/) override { auto& data = formatData_->as(); VELOX_WIDTH_DISPATCH( @@ -80,7 +80,7 @@ class IntegerColumnReader : public dwio::common::SelectiveIntegerColumnReader { } template - void readWithVisitor(RowSet rows, ColumnVisitor visitor) { + void readWithVisitor(const RowSet& rows, ColumnVisitor visitor) { formatData_->as().readWithVisitor(visitor); } }; diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index e36e305fe943..81b794dc86d2 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -853,9 +853,9 @@ class ParquetRowReader::Impl { requestedType_, readerBase_->schemaWithId(), // Id is schema id params, - *options_.getScanSpec()); + *options_.scanSpec()); columnReader_->setFillMutatedOutputRows( - options_.getRowNumberColumnInfo().has_value()); + options_.rowNumberColumnInfo().has_value()); filterRowGroups(); if (!rowGroupIds_.empty()) { @@ -872,7 +872,7 @@ class ParquetRowReader::Impl { ParquetData::FilterRowGroupsResult res; columnReader_->filterRowGroups(0, ParquetStatsContext(), res); - if (auto& metadataFilter = options_.getMetadataFilter()) { + if (auto& metadataFilter = options_.metadataFilter()) { metadataFilter->eval(res.metadataFilterResults, res.filterResult); } @@ -886,8 +886,7 @@ class ParquetRowReader::Impl { : rowGroups_[i].columns[0].meta_data.data_page_offset; VELOX_CHECK_GT(fileOffset, 0); auto rowGroupInRange = - (fileOffset >= options_.getOffset() && - fileOffset < options_.getLimit()); + (fileOffset >= options_.offset() && fileOffset < options_.limit()); auto isExcluded = (i < res.totalCount && bits::isBitSet(res.filterResult.data(), i)); @@ -928,7 +927,7 @@ class ParquetRowReader::Impl { return 0; } VELOX_DCHECK_GT(rowsToRead, 0); - if (!options_.getRowNumberColumnInfo().has_value()) { + if (!options_.rowNumberColumnInfo().has_value()) { columnReader_->next(rowsToRead, result, mutation); } else { readWithRowNumber( diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp index f7a5f0fa832b..f076aec8a5c5 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.cpp +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.cpp @@ -164,10 +164,10 @@ void MapColumnReader::setLengthsFromRepDefs(PageReader& pageReader) { auto repDefRange = pageReader.repDefRange(); int32_t numRepDefs = repDefRange.second - repDefRange.first; BufferPtr lengths = std::move(lengths_.lengths()); - dwio::common::ensureCapacity(lengths, numRepDefs, &memoryPool_); + dwio::common::ensureCapacity(lengths, numRepDefs, memoryPool_); memset(lengths->asMutable(), 0, lengths->size()); dwio::common::ensureCapacity( - nullsInReadRange_, bits::nwords(numRepDefs), &memoryPool_); + nullsInReadRange_, bits::nwords(numRepDefs), memoryPool_); auto numLists = pageReader.getLengthsAndNulls( LevelMode::kList, levelInfo_, @@ -184,7 +184,7 @@ void MapColumnReader::setLengthsFromRepDefs(PageReader& pageReader) { void MapColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { // The topmost list reader reads the repdefs for the left subtree. ensureRepDefs(*this, offset + rows.back() + 1 - readOffset_); @@ -268,10 +268,10 @@ void ListColumnReader::setLengthsFromRepDefs(PageReader& pageReader) { auto repDefRange = pageReader.repDefRange(); int32_t numRepDefs = repDefRange.second - repDefRange.first; BufferPtr lengths = std::move(lengths_.lengths()); - dwio::common::ensureCapacity(lengths, numRepDefs, &memoryPool_); + dwio::common::ensureCapacity(lengths, numRepDefs, memoryPool_); memset(lengths->asMutable(), 0, lengths->size()); dwio::common::ensureCapacity( - nullsInReadRange_, bits::nwords(numRepDefs), &memoryPool_); + nullsInReadRange_, bits::nwords(numRepDefs), memoryPool_); auto numLists = pageReader.getLengthsAndNulls( LevelMode::kList, levelInfo_, @@ -287,7 +287,8 @@ void ListColumnReader::setLengthsFromRepDefs(PageReader& pageReader) { } void ListColumnReader::read( vector_size_t offset, - RowSet rows, + + const RowSet& rows, const uint64_t* incomingNulls) { // The topmost list reader reads the repdefs for the left subtree. ensureRepDefs(*this, offset + rows.back() + 1 - readOffset_); diff --git a/velox/dwio/parquet/reader/RepeatedColumnReader.h b/velox/dwio/parquet/reader/RepeatedColumnReader.h index 317b374b79ee..6aef4ff05239 100644 --- a/velox/dwio/parquet/reader/RepeatedColumnReader.h +++ b/velox/dwio/parquet/reader/RepeatedColumnReader.h @@ -74,7 +74,7 @@ class MapColumnReader : public dwio::common::SelectiveMapColumnReader { void read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* /*incomingNulls*/) override; void setLengths(BufferPtr lengths) { @@ -130,7 +130,7 @@ class ListColumnReader : public dwio::common::SelectiveListColumnReader { void read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* /*incomingNulls*/) override; void setLengths(BufferPtr lengths) { diff --git a/velox/dwio/parquet/reader/RleBpDataDecoder.h b/velox/dwio/parquet/reader/RleBpDataDecoder.h index 71ef471cce4f..9ab89a54500d 100644 --- a/velox/dwio/parquet/reader/RleBpDataDecoder.h +++ b/velox/dwio/parquet/reader/RleBpDataDecoder.h @@ -282,7 +282,7 @@ class RleBpDataDecoder : public facebook::velox::parquet::RleBpDecoder { if (ptr < lastSafeWord) { return *reinterpret_cast(ptr) >> bitOffset; } - int32_t byteWidth = bits::roundUp(bitOffset + bitWidth, 8) / 8; + const int32_t byteWidth = bits::divRoundUp(bitOffset + bitWidth, 8); return bits::loadPartialWord( reinterpret_cast(ptr), byteWidth) >> bitOffset; diff --git a/velox/dwio/parquet/reader/RleBpDecoder.h b/velox/dwio/parquet/reader/RleBpDecoder.h index 2b9bbe58f864..ac07ed76ad0c 100644 --- a/velox/dwio/parquet/reader/RleBpDecoder.h +++ b/velox/dwio/parquet/reader/RleBpDecoder.h @@ -28,7 +28,7 @@ class RleBpDecoder { : bufferStart_(start), bufferEnd_(end), bitWidth_(bitWidth), - byteWidth_(bits::roundUp(bitWidth, 8) / 8), + byteWidth_(bits::divRoundUp(bitWidth, 8)), bitMask_(bits::lowMask(bitWidth)), lastSafeWord_(end - sizeof(uint64_t)) {} diff --git a/velox/dwio/parquet/reader/StringColumnReader.cpp b/velox/dwio/parquet/reader/StringColumnReader.cpp index 2dd0250159c3..634fe1db61d0 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.cpp +++ b/velox/dwio/parquet/reader/StringColumnReader.cpp @@ -33,7 +33,7 @@ uint64_t StringColumnReader::skip(uint64_t numValues) { void StringColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* incomingNulls) { prepareRead(offset, rows, incomingNulls); dwio::common::StringColumnReadWithVisitorHelper( @@ -43,14 +43,14 @@ void StringColumnReader::read( readOffset_ += rows.back() + 1; } -void StringColumnReader::getValues(RowSet rows, VectorPtr* result) { +void StringColumnReader::getValues(const RowSet& rows, VectorPtr* result) { if (scanState_.dictionary.values) { auto dictionaryValues = formatData_->as().dictionaryValues(fileType_->type()); compactScalarValues(rows, false); *result = std::make_shared>( - &memoryPool_, resultNulls(), numValues_, dictionaryValues, values_); + memoryPool_, resultNulls(), numValues_, dictionaryValues, values_); return; } rawStringBuffer_ = nullptr; diff --git a/velox/dwio/parquet/reader/StringColumnReader.h b/velox/dwio/parquet/reader/StringColumnReader.h index e9bedc2365fc..8c3a9a2a6b0f 100644 --- a/velox/dwio/parquet/reader/StringColumnReader.h +++ b/velox/dwio/parquet/reader/StringColumnReader.h @@ -43,10 +43,12 @@ class StringColumnReader : public dwio::common::SelectiveColumnReader { uint64_t skip(uint64_t numValues) override; - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; - void getValues(RowSet rows, VectorPtr* result) override; + void getValues(const RowSet& rows, VectorPtr* result) override; void dedictionarize() override; }; diff --git a/velox/dwio/parquet/reader/StructColumnReader.cpp b/velox/dwio/parquet/reader/StructColumnReader.cpp index cbff1d0716dc..66d339b8732d 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.cpp +++ b/velox/dwio/parquet/reader/StructColumnReader.cpp @@ -98,7 +98,7 @@ StructColumnReader::findBestLeaf() { void StructColumnReader::read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* /*incomingNulls*/) { ensureRepDefs(*this, offset + rows.back() + 1 - readOffset_); SelectiveStructColumnReader::read(offset, rows, nullptr); @@ -176,7 +176,7 @@ void StructColumnReader::setNullsFromRepDefs(PageReader& pageReader) { auto repDefRange = pageReader.repDefRange(); int32_t numRepDefs = repDefRange.second - repDefRange.first; dwio::common::ensureCapacity( - nullsInReadRange_, bits::nwords(numRepDefs), &memoryPool_); + nullsInReadRange_, bits::nwords(numRepDefs), memoryPool_); auto numStructs = pageReader.getLengthsAndNulls( levelMode_, levelInfo_, diff --git a/velox/dwio/parquet/reader/StructColumnReader.h b/velox/dwio/parquet/reader/StructColumnReader.h index 198d2cac0f5f..d9c41e3849d8 100644 --- a/velox/dwio/parquet/reader/StructColumnReader.h +++ b/velox/dwio/parquet/reader/StructColumnReader.h @@ -37,8 +37,10 @@ class StructColumnReader : public dwio::common::SelectiveStructColumnReader { ParquetParams& params, common::ScanSpec& scanSpec); - void read(vector_size_t offset, RowSet rows, const uint64_t* incomingNulls) - override; + void read( + vector_size_t offset, + const RowSet& rows, + const uint64_t* incomingNulls) override; void seekToRowGroup(uint32_t index) override; diff --git a/velox/dwio/parquet/reader/TimestampColumnReader.h b/velox/dwio/parquet/reader/TimestampColumnReader.h index 11eb00e24286..d7de779f83a1 100644 --- a/velox/dwio/parquet/reader/TimestampColumnReader.h +++ b/velox/dwio/parquet/reader/TimestampColumnReader.h @@ -35,7 +35,7 @@ class TimestampColumnReader : public IntegerColumnReader { return false; } - void getValues(RowSet rows, VectorPtr* result) override { + void getValues(const RowSet& rows, VectorPtr* result) override { getFlatValues(rows, result, requestedType_); if (allNull_) { return; @@ -67,7 +67,7 @@ class TimestampColumnReader : public IntegerColumnReader { void read( vector_size_t offset, - RowSet rows, + const RowSet& rows, const uint64_t* /*incomingNulls*/) override { auto& data = formatData_->as(); // Use int128_t as a workaroud. Timestamp in Velox is of 16-byte length. diff --git a/velox/exec/OperatorUtils.cpp b/velox/exec/OperatorUtils.cpp index b7336f39160a..140eded2bf2d 100644 --- a/velox/exec/OperatorUtils.cpp +++ b/velox/exec/OperatorUtils.cpp @@ -178,7 +178,7 @@ namespace { vector_size_t processConstantFilterResults( const VectorPtr& filterResult, const SelectivityVector& rows) { - auto constant = filterResult->as>(); + const auto constant = filterResult->as>(); if (constant->isNullAt(0) || constant->valueAt(0) == false) { return 0; } @@ -190,15 +190,15 @@ vector_size_t processFlatFilterResults( const SelectivityVector& rows, FilterEvalCtx& filterEvalCtx, memory::MemoryPool* pool) { - auto size = rows.size(); + const auto size = rows.size(); - auto selectedBits = filterEvalCtx.getRawSelectedBits(size, pool); - auto nonNullBits = + auto* selectedBits = filterEvalCtx.getRawSelectedBits(size, pool); + auto* nonNullBits = filterResult->as>()->rawValues(); if (filterResult->mayHaveNulls()) { bits::andBits(selectedBits, nonNullBits, filterResult->rawNulls(), 0, size); } else { - memcpy(selectedBits, nonNullBits, bits::nbytes(size)); + ::memcpy(selectedBits, nonNullBits, bits::nbytes(size)); } if (!rows.isAllSelected()) { bits::andBits(selectedBits, rows.allBits(), 0, size); diff --git a/velox/type/Subfield.h b/velox/type/Subfield.h index 72cb8cd8e520..9fc7f343b214 100644 --- a/velox/type/Subfield.h +++ b/velox/type/Subfield.h @@ -219,7 +219,6 @@ class Subfield { const std::string index_; }; - public: // Separators: the customized separators to tokenize field name. explicit Subfield( const std::string& path, diff --git a/velox/vector/BaseVector.cpp b/velox/vector/BaseVector.cpp index d6dcbbbb9411..2ae0d69212ff 100644 --- a/velox/vector/BaseVector.cpp +++ b/velox/vector/BaseVector.cpp @@ -70,9 +70,9 @@ BaseVector::BaseVector( void BaseVector::ensureNullsCapacity( vector_size_t minimumSize, bool setNotNull) { - auto fill = setNotNull ? bits::kNotNull : bits::kNull; + const auto fill = setNotNull ? bits::kNotNull : bits::kNull; // Ensure the size of nulls_ is always at least as large as length_. - auto size = std::max(minimumSize, length_); + const auto size = std::max(minimumSize, length_); if (nulls_ && !nulls_->isView() && nulls_->unique()) { if (nulls_->capacity() < bits::nbytes(size)) { AlignedBuffer::reallocate(&nulls_, size, fill); @@ -89,7 +89,7 @@ void BaseVector::ensureNullsCapacity( } else { auto newNulls = AlignedBuffer::allocate(size, pool_, fill); if (nulls_) { - memcpy( + ::memcpy( newNulls->asMutable(), nulls_->as(), byteSize(std::min(length_, size))); @@ -106,7 +106,7 @@ uint64_t BaseVector::byteSize(vector_size_t count) { void BaseVector::resize(vector_size_t size, bool setNotNull) { if (nulls_) { - auto bytes = byteSize(size); + const auto bytes = byteSize(size); if (length_ < size || nulls_->isView()) { ensureNullsCapacity(size, setNotNull); } @@ -193,7 +193,7 @@ static VectorPtr addConstant( bool copyBase) { using T = typename KindToFlatVector::WrapperType; - auto pool = vector->pool(); + auto* pool = vector->pool(); if (vector->isNullAt(index)) { if constexpr (std::is_same_v) { @@ -246,7 +246,7 @@ VectorPtr BaseVector::wrapInConstant( vector_size_t index, VectorPtr vector, bool copyBase) { - auto kind = vector->typeKind(); + const auto kind = vector->typeKind(); return VELOX_DYNAMIC_TYPE_DISPATCH_ALL( addConstant, kind, length, index, std::move(vector), copyBase); } @@ -311,7 +311,7 @@ VectorPtr BaseVector::createInternal( case TypeKind::ARRAY: { BufferPtr sizes = allocateSizes(size, pool); BufferPtr offsets = allocateOffsets(size, pool); - auto elementType = type->as().elementType(); + const auto& elementType = type->as().elementType(); auto elements = create(elementType, 0, pool); return std::make_shared( pool, @@ -325,8 +325,8 @@ VectorPtr BaseVector::createInternal( case TypeKind::MAP: { BufferPtr sizes = allocateSizes(size, pool); BufferPtr offsets = allocateOffsets(size, pool); - auto keyType = type->as().keyType(); - auto valueType = type->as().valueType(); + const auto& keyType = type->as().keyType(); + const auto& valueType = type->as().valueType(); auto keys = create(keyType, 0, pool); auto values = create(valueType, 0, pool); return std::make_shared( @@ -448,8 +448,8 @@ void BaseVector::clearNulls(vector_size_t begin, vector_size_t end) { return; } - auto rawNulls = nulls_->asMutable(); - bits::fillBits(rawNulls, begin, end, true); + auto* rawNulls = nulls_->asMutable(); + bits::fillBits(rawNulls, begin, end, bits::kNotNull); nullCount_ = std::nullopt; } @@ -793,6 +793,7 @@ bool isLazyNotLoaded(const BaseVector& vector) { // deeper. return isLazyNotLoaded(*vector.asUnchecked()->loadedVector()); case VectorEncoding::Simple::DICTIONARY: + [[fallthrough]]; case VectorEncoding::Simple::SEQUENCE: return isLazyNotLoaded(*vector.valueVector()); case VectorEncoding::Simple::CONSTANT: @@ -890,7 +891,7 @@ void BaseVector::reuseNulls() { // there is at least one null bit set. Reset otherwise. if (nulls_) { if (nulls_->isMutable()) { - if (0 == BaseVector::countNulls(nulls_, length_)) { + if (BaseVector::countNulls(nulls_, length_) == 0) { nulls_ = nullptr; rawNulls_ = nullptr; } diff --git a/velox/vector/BaseVector.h b/velox/vector/BaseVector.h index 1339222d8e46..74478e2f7030 100644 --- a/velox/vector/BaseVector.h +++ b/velox/vector/BaseVector.h @@ -806,9 +806,7 @@ class BaseVector { } void clearContainingLazyAndWrapped() { - if (containsLazyAndIsWrapped_) { - containsLazyAndIsWrapped_ = false; - } + containsLazyAndIsWrapped_ = false; } bool memoDisabled() const { diff --git a/velox/vector/LazyVector.cpp b/velox/vector/LazyVector.cpp index b48173fcaed1..b836faeecc95 100644 --- a/velox/vector/LazyVector.cpp +++ b/velox/vector/LazyVector.cpp @@ -260,7 +260,7 @@ void LazyVector::loadVectorInternal() const { } SelectivityVector allRows(BaseVector::length_); loader_->load(allRows, nullptr, size(), &vector_); - VELOX_CHECK(vector_); + VELOX_CHECK_NOT_NULL(vector_); if (vector_->encoding() == VectorEncoding::Simple::LAZY) { vector_ = vector_->asUnchecked()->loadedVectorShared(); } else { @@ -274,7 +274,7 @@ void LazyVector::loadVectorInternal() const { BaseVector::nulls_->as(); } } else { - VELOX_CHECK(vector_); + VELOX_CHECK_NOT_NULL(vector_); } } } // namespace facebook::velox