diff --git a/thirdparty/CHANGELOG.md b/thirdparty/CHANGELOG.md index 065fd187da536c..6065fdf4485f4d 100644 --- a/thirdparty/CHANGELOG.md +++ b/thirdparty/CHANGELOG.md @@ -170,6 +170,13 @@ Now there will be 2 set of libhdfs, one is without kerberos, the other is with k ## v20211215 +## v20240521 +- Modified: arrow 7.0.0 -> 13.0.0 +- Modified: jemalloc for arrow 5.2.1 -> 5.3.0 +- Modified: xsimd aeec9c8 -> 9.0.1 +- Added: c-ares -> 1.19.1 +- Added: grpc -> 1.54.3 + ### Changes - Added: cyrus-sasl diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index aeb4e580e0042a..35828b8bdcee5c 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -958,6 +958,8 @@ build_arrow() { export ARROW_ZLIB_URL="${TP_SOURCE_DIR}/${ZLIB_NAME}" export ARROW_XSIMD_URL="${TP_SOURCE_DIR}/${XSIMD_NAME}" export ARROW_ORC_URL="${TP_SOURCE_DIR}/${ORC_NAME}" + export ARROW_GRPC_URL="${TP_SOURCE_DIR}/${GRPC_NAME}" + export ARROW_PROTOBUF_URL="${TP_SOURCE_DIR}/${PROTOBUF_NAME}" if [[ "${KERNEL}" != 'Darwin' ]]; then ldflags="-L${TP_LIB_DIR} -static-libstdc++ -static-libgcc" @@ -973,22 +975,38 @@ build_arrow() { -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ -DCMAKE_INSTALL_LIBDIR=lib64 \ -DARROW_BOOST_USE_SHARED=OFF \ + -DARROW_WITH_GRPC=ON \ + -DgRPC_SOURCE=SYSTEM \ + -DgRPC_ROOT="${TP_INSTALL_DIR}" \ + -DARROW_WITH_PROTOBUF=ON \ + -DProtobuf_SOURCE=SYSTEM \ + -DProtobuf_LIB="${TP_INSTALL_DIR}/lib/libprotoc.a" -DProtobuf_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \ + -DARROW_FLIGHT=ON \ + -DARROW_FLIGHT_SQL=ON \ -DBoost_USE_STATIC_RUNTIME=ON \ -DARROW_GFLAGS_USE_SHARED=OFF \ -Dgflags_ROOT="${TP_INSTALL_DIR}" \ -DGLOG_ROOT="${TP_INSTALL_DIR}" \ -DRE2_ROOT="${TP_INSTALL_DIR}" \ + -DZLIB_SOURCE=SYSTEM \ -DZLIB_LIBRARY="${TP_INSTALL_DIR}/lib/libz.a" -DZLIB_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \ + -DRapidJSON_SOURCE=SYSTEM \ -DRapidJSON_ROOT="${TP_INSTALL_DIR}" \ -DORC_ROOT="${TP_INSTALL_DIR}" \ + -Dxsimd_SOURCE=BUNDLED \ -DBrotli_SOURCE=BUNDLED \ +
-DARROW_LZ4_USE_SHARED=OFF \ -DLZ4_LIB="${TP_INSTALL_DIR}/lib/liblz4.a" -DLZ4_INCLUDE_DIR="${TP_INSTALL_DIR}/include/lz4" \ -DLz4_SOURCE=SYSTEM \ + -DARROW_ZSTD_USE_SHARED=OFF \ -DZSTD_LIB="${TP_INSTALL_DIR}/lib/libzstd.a" -DZSTD_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \ -Dzstd_SOURCE=SYSTEM \ -DSnappy_LIB="${TP_INSTALL_DIR}/lib/libsnappy.a" -DSnappy_INCLUDE_DIR="${TP_INSTALL_DIR}/include" \ -DSnappy_SOURCE=SYSTEM \ -DBOOST_ROOT="${TP_INSTALL_DIR}" --no-warn-unused-cli \ + -Djemalloc_SOURCE=BUNDLED \ + -DARROW_THRIFT_USE_SHARED=OFF \ + -DThrift_SOURCE=SYSTEM \ -DThrift_ROOT="${TP_INSTALL_DIR}" .. "${BUILD_SYSTEM}" -j "${PARALLEL}" @@ -1662,6 +1680,56 @@ build_libdeflate() { "${BUILD_SYSTEM}" install } +# c-ares +build_cares() { + check_if_source_exist "${CARES_SOURCE}" + cd "${TP_SOURCE_DIR}/${CARES_SOURCE}" + + mkdir -p build + cd build + cmake -DCMAKE_BUILD_TYPE=Release \ + -DCARES_STATIC=ON \ + -DCARES_SHARED=OFF \ + -DCARES_STATIC_PIC=ON \ + -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" .. + make + make install +} + +# grpc +build_grpc() { + check_if_source_exist "${GRPC_SOURCE}" + cd "${TP_SOURCE_DIR}/${GRPC_SOURCE}" + + mkdir -p cmake/build + cd cmake/build + + cmake -DgRPC_INSTALL=ON \ + -DgRPC_BUILD_TESTS=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX="${TP_INSTALL_DIR}" \ + -DgRPC_CARES_PROVIDER=package \ + -Dc-ares_DIR="${TP_INSTALL_DIR}" \ + -DgRPC_ABSL_PROVIDER=package \ + -Dabsl_DIR="${TP_INSTALL_DIR}" \ + -DgRPC_PROTOBUF_PROVIDER=package \ + -DProtobuf_DIR="${TP_INSTALL_DIR}" \ + -DgRPC_RE2_PROVIDER=package \ + -Dre2_DIR:STRING="${TP_INSTALL_DIR}" \ + -DgRPC_SSL_PROVIDER=package \ + -DOPENSSL_ROOT_DIR="${TP_INSTALL_DIR}" \ + -DgRPC_ZLIB_PROVIDER=package \ + -DZLIB_ROOT="${TP_INSTALL_DIR}" \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + ../.. + + make -j "${PARALLEL}" + make install + + # for grpc > v1.55, cmake 3.22 does not support find_dependency, delete this line after cmake version upgrade.
+ # sed -i 's/find_dependency/find_package/g' "${TP_INSTALL_DIR}"/lib64/cmake/grpc/gRPCConfig.cmake +} + if [[ "${#packages[@]}" -eq 0 ]]; then packages=( libunixodbc @@ -1673,9 +1741,9 @@ if [[ "${#packages[@]}" -eq 0 ]]; then lzo2 zstd boost # must before thrift - protobuf gflags gtest + protobuf # after gtest glog rapidjson snappy @@ -1693,6 +1761,8 @@ if [[ "${#packages[@]}" -eq 0 ]]; then librdkafka flatbuffers orc + cares + grpc # after cares, protobuf arrow abseil s2 diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index 783c6fab00700f..f20b707dfadae2 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -323,10 +323,10 @@ fi echo "Finished patching ${OPENTELEMETRY_SOURCE}" # arrow patch is used to get the raw orc reader for filter prune. -if [[ "${ARROW_SOURCE}" == "apache-arrow-7.0.0" ]]; then +if [[ "${ARROW_SOURCE}" == "arrow-apache-arrow-13.0.0" ]]; then cd "${TP_SOURCE_DIR}/${ARROW_SOURCE}" if [[ ! -f "${PATCHED_MARK}" ]]; then - patch -p1 <"${TP_PATCH_DIR}/apache-arrow-7.0.0.patch" + patch -p1 <"${TP_PATCH_DIR}/apache-arrow-13.0.0.patch" touch "${PATCHED_MARK}" fi cd - diff --git a/thirdparty/patches/apache-arrow-13.0.0.patch b/thirdparty/patches/apache-arrow-13.0.0.patch new file mode 100644 index 00000000000000..c59b95eeb1c12c --- /dev/null +++ b/thirdparty/patches/apache-arrow-13.0.0.patch @@ -0,0 +1,120 @@ +diff --git a/cpp/src/arrow/adapters/orc/adapter.cc b/cpp/src/arrow/adapters/orc/adapter.cc +index 2466e7433..46b4402d4 100644 +--- a/cpp/src/arrow/adapters/orc/adapter.cc ++++ b/cpp/src/arrow/adapters/orc/adapter.cc +@@ -47,9 +47,6 @@ + #include "arrow/util/visibility.h" + #include "orc/Exceptions.hh" + +-// alias to not interfere with nested orc namespace +-namespace liborc = orc; +- + #define ORC_THROW_NOT_OK(s) \ + do { \ + Status _s = (s); \ +@@ -202,6 +199,8 @@ class ORCFileReader::Impl { + return Init(); + } + ++ virtual liborc::Reader* GetRawORCReader() { return 
reader_.get(); } ++ + Status Init() { + int64_t nstripes = reader_->getNumberOfStripes(); + stripes_.resize(nstripes); +@@ -479,6 +478,31 @@ class ORCFileReader::Impl { + return Status::OK(); + } + ++ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( ++ int64_t batch_size, const std::vector<std::string>& include_names) { ++ if (current_row_ >= NumberOfRows()) { ++ return nullptr; ++ } ++ ++ liborc::RowReaderOptions opts = default_row_reader_options(); ++ if (!include_names.empty()) { ++ RETURN_NOT_OK(SelectNames(&opts, include_names)); ++ } ++ StripeInformation stripe_info({0, 0, 0, 0}); ++ RETURN_NOT_OK(SelectStripeWithRowNumber(&opts, current_row_, &stripe_info)); ++ ARROW_ASSIGN_OR_RAISE(auto schema, ReadSchema(opts)); ++ std::unique_ptr<liborc::RowReader> row_reader; ++ ++ ORC_BEGIN_CATCH_NOT_OK ++ row_reader = reader_->createRowReader(opts); ++ row_reader->seekToRow(current_row_); ++ current_row_ = stripe_info.first_row_id + stripe_info.num_rows; ++ ORC_END_CATCH_NOT_OK ++ ++ return std::make_shared<OrcStripeReader>(std::move(row_reader), schema, batch_size, ++ pool_); ++ } ++ + Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( + int64_t batch_size, const std::vector<int>& include_indices) { + if (current_row_ >= NumberOfRows()) { +@@ -544,6 +568,8 @@ Result<std::unique_ptr<ORCFileReader>> ORCFileReader::Open( + return std::move(result); + } + ++liborc::Reader* ORCFileReader::GetRawORCReader() { return impl_->GetRawORCReader(); } ++ + Result<std::shared_ptr<const KeyValueMetadata>> ORCFileReader::ReadMetadata() { + return impl_->ReadMetadata(); + } +@@ -605,6 +631,11 @@ Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( + return impl_->NextStripeReader(batch_size, include_indices); + } + ++Result<std::shared_ptr<RecordBatchReader>> ORCFileReader::NextStripeReader( ++ int64_t batch_size, const std::vector<std::string>& include_names) { ++ return impl_->NextStripeReader(batch_size, include_names); ++} ++ + int64_t ORCFileReader::NumberOfStripes() { return impl_->NumberOfStripes(); } + + int64_t ORCFileReader::NumberOfRows() { return impl_->NumberOfRows(); } +diff --git a/cpp/src/arrow/adapters/orc/adapter.h b/cpp/src/arrow/adapters/orc/adapter.h +index 013be7860..7fd06bcb8 100644 +---
a/cpp/src/arrow/adapters/orc/adapter.h ++++ b/cpp/src/arrow/adapters/orc/adapter.h +@@ -30,6 +30,10 @@ + #include "arrow/type_fwd.h" + #include "arrow/util/macros.h" + #include "arrow/util/visibility.h" ++#include "orc/Reader.hh" ++ ++// alias to not interfere with nested orc namespace ++namespace liborc = orc; + + namespace arrow { + namespace adapters { +@@ -53,6 +57,9 @@ class ARROW_EXPORT ORCFileReader { + public: + ~ORCFileReader(); + ++ /// \brief Get ORC reader from inside. ++ liborc::Reader* GetRawORCReader(); ++ + /// \brief Creates a new ORC reader + /// + /// \param[in] file the data source +@@ -174,6 +181,19 @@ class ARROW_EXPORT ORCFileReader { + Result<std::shared_ptr<RecordBatchReader>> GetRecordBatchReader( + int64_t batch_size, const std::vector<std::string>& include_names); + ++ /// \brief Get a stripe level record batch iterator with specified row count ++ /// in each record batch. NextStripeReader serves as a fine grain ++ /// alternative to ReadStripe which may cause OOM issue by loading ++ /// the whole stripes into memory. ++ /// ++ /// \param[in] batch_size Get a stripe level record batch iterator with specified row ++ /// count in each record batch.
++ /// ++ /// \param[in] include_names the selected field names to read ++ /// \return the returned stripe reader ++ Result<std::shared_ptr<RecordBatchReader>> NextStripeReader( ++ int64_t batch_size, const std::vector<std::string>& include_names); ++ + /// \brief The number of stripes in the file + int64_t NumberOfStripes(); diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh index 7dd28b1b32c422..6fd3d5173abf8d 100644 --- a/thirdparty/vars.sh +++ b/thirdparty/vars.sh @@ -238,11 +238,24 @@ FLATBUFFERS_NAME=flatbuffers-2.0.0.tar.gz FLATBUFFERS_SOURCE=flatbuffers-2.0.0 FLATBUFFERS_MD5SUM="a27992324c3cbf86dd888268a23d17bd" +# c-ares +CARES_DOWNLOAD="https://github.com/c-ares/c-ares/releases/download/cares-1_19_1/c-ares-1.19.1.tar.gz" +CARES_NAME="c-ares-1.19.1.tar.gz" +CARES_SOURCE=c-ares-1.19.1 +CARES_MD5SUM="dafc5825a92dc907e144570e4e75a908" + +# grpc +# grpc v1.55 and above require protobuf >= 22 +GRPC_DOWNLOAD="https://github.com/grpc/grpc/archive/refs/tags/v1.54.3.tar.gz" +GRPC_NAME="grpc-v1.54.3.tar.gz" +GRPC_SOURCE=grpc-1.54.3 +GRPC_MD5SUM="af00a2edeae0f02bb25917cc3473b7de" + # arrow -ARROW_DOWNLOAD="https://archive.apache.org/dist/arrow/arrow-7.0.0/apache-arrow-7.0.0.tar.gz" -ARROW_NAME="apache-arrow-7.0.0.tar.gz" -ARROW_SOURCE="apache-arrow-7.0.0" -ARROW_MD5SUM="316ade159901646849b3b4760fa52816" +ARROW_DOWNLOAD="https://github.com/apache/arrow/archive/refs/tags/apache-arrow-13.0.0.tar.gz" +ARROW_NAME="apache-arrow-13.0.0.tar.gz" +ARROW_SOURCE="arrow-apache-arrow-13.0.0" +ARROW_MD5SUM="8ec1ec6a119514bcaea1cf7aabc9df1f" # Abseil ABSEIL_DOWNLOAD="https://github.com/abseil/abseil-cpp/archive/refs/tags/20220623.1.tar.gz" @@ -287,10 +300,10 @@ ORC_SOURCE=orc-1.7.2 ORC_MD5SUM="6cab37935eacdec7d078d327746a8578" # jemalloc for arrow -JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.2.1/jemalloc-5.2.1.tar.bz2" -JEMALLOC_ARROW_NAME="jemalloc-5.2.1.tar.bz2" -JEMALLOC_ARROW_SOURCE="jemalloc-5.2.1" -JEMALLOC_ARROW_MD5SUM="3d41fbf006e6ebffd489bdb304d009ae"
+JEMALLOC_ARROW_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2" +JEMALLOC_ARROW_NAME="jemalloc-5.3.0.tar.bz2" +JEMALLOC_ARROW_SOURCE="jemalloc-5.3.0" +JEMALLOC_ARROW_MD5SUM="09a8328574dab22a7df848eae6dbbf53" # jemalloc for doris JEMALLOC_DORIS_DOWNLOAD="https://github.com/jemalloc/jemalloc/releases/download/5.3.0/jemalloc-5.3.0.tar.bz2" @@ -399,10 +412,10 @@ BENCHMARK_MD5SUM="8ddf8571d3f6198d37852bcbd964f817" # xsimd # for arrow-13.0.0, if arrow is upgraded, this version may also need to be changed -XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz" -XSIMD_NAME=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2.tar.gz -XSIMD_SOURCE=xsimd-aeec9c872c8b475dedd7781336710f2dd2666cb2 -XSIMD_MD5SUM="d024855f71c0a2837a6918c0f8f66245" +XSIMD_DOWNLOAD="https://github.com/xtensor-stack/xsimd/archive/refs/tags/9.0.1.tar.gz" +XSIMD_NAME="xsimd-9.0.1.tar.gz" +XSIMD_SOURCE=xsimd-9.0.1 +XSIMD_MD5SUM="59f38fe3364acd7ed137771258812d6c" # simdjson SIMDJSON_DOWNLOAD="https://github.com/simdjson/simdjson/archive/refs/tags/v3.0.1.tar.gz" @@ -505,6 +518,8 @@ export TP_ARCHIVES=( 'CYRUS_SASL' 'LIBRDKAFKA' 'FLATBUFFERS' + 'CARES' + 'GRPC' 'ARROW' 'BROTLI' 'ZSTD'