diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 7960ca8305..b010d230b3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -115,7 +115,7 @@ build_train_single_node_with_hdfs_minimal:
   variables:
     FROM_IMAGE: ${IMAGE_ALL}
     DST_IMAGE: $TRAIN_IMAGE_VERSIONED_WITH_HDFS_MINI
-    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_HDFS=MINIMAL"
+    CMAKE_OPTION: "-DCMAKE_BUILD_TYPE=Release -DKEY_HIT_RATIO=ON -DSM=\"60;61;70;75;80;90\" -DCLANGFORMAT=OFF -DENABLE_HDFS=ON"
     BUILD_HUGECTR: 1
     BUILD_HUGECTR2ONNX: 1
diff --git a/.gitmodules b/.gitmodules
index a778699774..f45803cf28 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -32,12 +32,6 @@
 [submodule "third_party/librdkafka"]
 	path = third_party/librdkafka
 	url = https://github.com/edenhill/librdkafka.git
-[submodule "third_party/protobuf"]
-	path = third_party/protobuf
-	url = https://github.com/protocolbuffers/protobuf.git
-[submodule "third_party/hadoop"]
-	path = third_party/hadoop
-	url = https://github.com/apache/hadoop.git
 [submodule "third_party/HierarchicalKV"]
 	path = third_party/HierarchicalKV
 	url = https://github.com/NVIDIA-Merlin/HierarchicalKV.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 63b5ab334a..6e247b964b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,6 +14,7 @@
 #
 cmake_minimum_required(VERSION 3.17)
+
 project(HugeCTR LANGUAGES CXX CUDA)
 
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
@@ -351,17 +352,57 @@ add_subdirectory(gpu_cache/src)
 
 option(ENABLE_HDFS "Enable HDFS" OFF)
 if(ENABLE_HDFS)
-  if(ENABLE_HDFS STREQUAL "MINIMAL")
-    message("HDFS build mode: Client only")
-  else()
-    message("HDFS build mode: Full")
-  endif()
+  message(STATUS "HDFS build mode: Client only")
   set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DENABLE_HDFS")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_HDFS")
+
+  set(FETCHCONTENT_QUIET OFF)
+
+  # Java.
+  if (NOT EXISTS /usr/bin/mvn)
+    execute_process(WORKING_DIRECTORY "${CMAKE_BINARY_DIR}"
+      COMMAND /bin/bash ${PROJECT_SOURCE_DIR}/sbin/install-jdk-and-maven.sh
+      COMMAND_ERROR_IS_FATAL ANY
+    )
+  endif()
 
-  # Build and Install Hadoop
-  include(SetupHadoop)
-  hadoop_setup(${ENABLE_HDFS})
+  # Hadoop.
+  # sudo apt install libboost-date-time-dev
+  # sudo apt install libboost-program-options-dev
+  # sudo apt install libprotobuf-dev
+  # sudo apt install libfuse-dev
+  # sudo apt install libprotoc-dev
+  FetchContent_Declare(hadoop
+    DOWNLOAD_COMMAND git clone
+                     --branch rel/release-3.4.0
+                     --depth 1
+                     --progress https://github.com/apache/hadoop.git
+                     "${CMAKE_BINARY_DIR}/_deps/hadoop-src"
+  )
+  FetchContent_Populate(hadoop)
+  set(hadoop_SOURCE_DIR "${hadoop_SOURCE_DIR}/hadoop-hdfs-project/hadoop-hdfs-native-client")
+  set(hadoop_BINARY_DIR "${hadoop_SOURCE_DIR}/target/hadoop-hdfs-native-client-3.4.0")
+  if(EXISTS ${hadoop_BINARY_DIR}/include/hdfs.h AND EXISTS ${hadoop_BINARY_DIR}/lib/native/libhdfs.a)
+    message(STATUS "Found hdfs library in ${hadoop_BINARY_DIR}")
+  else()
+    execute_process(WORKING_DIRECTORY "${hadoop_SOURCE_DIR}"
+      COMMAND mvn clean package
+              -Pdist,native
+              -DskipTests
+              -Dtar
+              -Dmaven.javadoc.skip=true
+              -Drequire.snappy
+              -Drequire.zstd
+              -Drequire.openssl
+              -Drequire.pmdk
+      COMMAND_ERROR_IS_FATAL ANY
+    )
+  endif()
+  set(FETCHCONTENT_QUIET ON)
+
+  include_directories("${hadoop_BINARY_DIR}/include")
+  link_directories("${hadoop_BINARY_DIR}/lib/native")
+  set(ENABLE_HDFS ON)
 endif()
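The block above vendors the Hadoop 3.4.0 native client: it clones the release tag, builds `libhdfs` with Maven unless a cached build already exists under `target/`, and exposes the result via `include_directories`/`link_directories`. A minimal sketch of the `hdfs.h` API this makes available — the namenode address and file path are hypothetical placeholders, not part of this change:

```c
/* Sketch: write one file through libhdfs. Compile against
 * ${hadoop_BINARY_DIR}/include and link -lhdfs from lib/native.
 * "localhost"/9000 and the path are example values only. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <hdfs.h>

int main(void) {
  hdfsFS fs = hdfsConnect("localhost", 9000);
  if (!fs) {
    fprintf(stderr, "hdfsConnect failed\n");
    return 1;
  }
  const char *path = "/tmp/hugectr_hdfs_smoke.txt";
  hdfsFile file = hdfsOpenFile(fs, path, O_WRONLY | O_CREAT, 0, 0, 0);
  if (!file) {
    fprintf(stderr, "hdfsOpenFile(%s) failed\n", path);
    hdfsDisconnect(fs);
    return 1;
  }
  const char *msg = "hello from HugeCTR\n";
  if (hdfsWrite(fs, file, msg, (tSize)strlen(msg)) < 0) {
    fprintf(stderr, "hdfsWrite failed\n");
  }
  hdfsCloseFile(fs, file);
  hdfsDisconnect(fs);
  return 0;
}
```

Note that libhdfs is a JNI wrapper, so running such a program still requires a JVM and the Hadoop jars on the CLASSPATH even though linking now targets the locally built client.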
diff --git a/HugeCTR/src/CMakeLists.txt b/HugeCTR/src/CMakeLists.txt
index 66be2d3b9f..6a6158ea9f 100755
--- a/HugeCTR/src/CMakeLists.txt
+++ b/HugeCTR/src/CMakeLists.txt
@@ -67,7 +67,7 @@ target_link_libraries(huge_ctr_shared PRIVATE nlohmann_json::nlohmann_json)
 target_link_libraries(huge_ctr_shared PUBLIC gpu_cache)
 
 if(ENABLE_HDFS)
-  target_link_libraries(huge_ctr_shared PUBLIC ${DB_LIB_PATHS}/libhdfs.so)
+  target_link_libraries(huge_ctr_shared PUBLIC hdfs)
 endif()
 
 if(ENABLE_S3)
diff --git a/HugeCTR/src/hps/CMakeLists.txt b/HugeCTR/src/hps/CMakeLists.txt
index db9a9b28a9..a481e9ca86 100644
--- a/HugeCTR/src/hps/CMakeLists.txt
+++ b/HugeCTR/src/hps/CMakeLists.txt
@@ -36,11 +36,7 @@ add_compile_definitions(LIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE)
 add_library(huge_ctr_hps SHARED ${huge_ctr_hps_src})
 
 if(ENABLE_HDFS)
-  target_link_libraries(
-    huge_ctr_hps
-    PUBLIC
-      ${DB_LIB_PATHS}/libhdfs.so # from Hugectr
-  )
+  target_link_libraries(huge_ctr_hps PUBLIC hdfs)
 endif()
 
 if(ENABLE_S3)
diff --git a/HugeCTR/src/inference_benchmark/CMakeLists.txt b/HugeCTR/src/inference_benchmark/CMakeLists.txt
index bd7add40da..5873701c10 100644
--- a/HugeCTR/src/inference_benchmark/CMakeLists.txt
+++ b/HugeCTR/src/inference_benchmark/CMakeLists.txt
@@ -20,11 +20,7 @@ file(GLOB hps_benchmark_src
 )
 
 if(ENABLE_HDFS)
-  target_link_libraries(
-    huge_ctr_inference
-    PUBLIC
-      ${DB_LIB_PATHS}/libhdfs.so # from Hugectr
-  )
+  target_link_libraries(huge_ctr_inference PUBLIC hdfs)
 endif()
 
 if(ENABLE_S3)
diff --git a/docs/source/hugectr_contributor_guide.md b/docs/source/hugectr_contributor_guide.md
index 50b8fe2a90..431a5341f1 100755
--- a/docs/source/hugectr_contributor_guide.md
+++ b/docs/source/hugectr_contributor_guide.md
@@ -104,10 +104,10 @@ To build HugeCTR Training Container from source, do the following:
   - **ENABLE_INFERENCE**: You can use this option to build HugeCTR in inference mode, which was designed for the inference framework. In this mode, an inference shared library will be built for the HugeCTR Backend. Only interfaces that support the HugeCTR Backend can be used. Therefore, you can’t train models in this mode. This option is set to OFF by default.
   For building inference container, please refer to [Build HugeCTR Inference Container from Source](#build-hugectr-inference-container-from-source)
-  - **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS related functions. Permissible values are `ON`, `MINIMAL` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary Hadoop modules that are required for building AND running both HugeCTR and HDFS. In contrast, `MINIMAL` restricts building only the minimum necessary set of components for building HugeCTR.
+  - **ENABLE_HDFS**: You can use this option to build HugeCTR together with HDFS to enable HDFS-related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all Hadoop modules that are required for building HugeCTR so that it can connect to HDFS deployments.
   - **ENABLE_S3**: You can use this option to build HugeCTR together with Amazon AWS S3 SDK to enable S3 related functions. Permissible values are `ON` and `OFF` *(default)*. Setting this option to `ON` leads to building all necessary AWS SKKs and dependencies that are required for building AND running both HugeCTR and S3.
 
-  **Please note that setting DENABLE_HDFS=ON/MINIMAL or DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**
+  **Please note that setting -DENABLE_HDFS=ON or -DENABLE_S3=ON requires root permission. So before using these two options to do the customized building, make sure you use `-u root` when you run the docker container.**
 
   Here are some examples of how you can build HugeCTR using these build options:
   ```shell
@@ -124,7 +124,7 @@ To build HugeCTR Training Container from source, do the following:
 
   ```shell
   $ mkdir -p build && cd build
-  $ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=MINIMAL .. # Target is NVIDIA V100 / A100 with only minimum HDFS components mode on.
+  $ cmake -DCMAKE_BUILD_TYPE=Release -DSM="70;80" -DENABLE_HDFS=ON .. # Target is NVIDIA V100 / A100 with HDFS support enabled.
   $ make -j && make install
   ```
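Since `ENABLE_HDFS=ON` now builds only the client, the deployment to connect to comes from the cluster configuration rather than from anything compiled into HugeCTR. A sketch of a configuration-driven connection, assuming a `core-site.xml` with `fs.defaultFS` is on the CLASSPATH at run time — none of these names are introduced by this patch:

```c
/* Sketch: connect to whatever namenode fs.defaultFS points at,
 * instead of hard-coding a host. "default" is the libhdfs
 * convention for "use the configured filesystem". */
#include <stdio.h>
#include <hdfs.h>

int main(void) {
  struct hdfsBuilder *bld = hdfsNewBuilder();
  hdfsBuilderSetNameNode(bld, "default");
  hdfsFS fs = hdfsBuilderConnect(bld);  /* frees the builder, success or not */
  if (!fs) {
    fprintf(stderr, "could not reach the configured namenode\n");
    return 1;
  }
  printf("connected, capacity: %lld bytes\n", (long long)hdfsGetCapacity(fs));
  hdfsDisconnect(fs);
  return 0;
}
```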
diff --git a/sbin/install-hadoop.sh b/sbin/install-hadoop.sh
index a16905d479..d7c4660894 100755
--- a/sbin/install-hadoop.sh
+++ b/sbin/install-hadoop.sh
@@ -40,7 +40,7 @@
 if [[ ! -f "${HADOOP_HOME}/include/hdfs.h" ]]; then
   cp hadoop-hdfs-project/hadoop-hdfs-native-client/src/main/native/libhdfs/include/hdfs/hdfs.h ${HADOOP_HOME}/include
 fi
-# Cleanup reundant files.
+# Cleanup redundant files.
 for f in $(find ${HADOOP_HOME} -name *.cmd); do
   rm -rf $f
 done
diff --git a/third_party/hadoop b/third_party/hadoop
deleted file mode 160000
index a585a73c3e..0000000000
--- a/third_party/hadoop
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit a585a73c3e02ac62350c136643a5e7f6095a3dbb
diff --git a/third_party/protobuf b/third_party/protobuf
deleted file mode 160000
index 22d0e265de..0000000000
--- a/third_party/protobuf
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 22d0e265de7d2b3d2e9a00d071313502e7d4cccf
diff --git a/tools/dockerfiles/Dockerfile.optimized b/tools/dockerfiles/Dockerfile.optimized
index 415b797e1e..7522db9eba 100644
--- a/tools/dockerfiles/Dockerfile.optimized
+++ b/tools/dockerfiles/Dockerfile.optimized
@@ -26,6 +26,11 @@ ARG RELEASE=true
 
 RUN apt-get update -y && \
     DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        libboost-date-time-dev \
+        libboost-program-options-dev \
+        libprotobuf-dev \
+        libprotoc-dev \
+        libfuse-dev \
         clang-format \
         libtbb-dev \
         libaio-dev && \
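With the packages above installed in the image, the Maven `-Pdist,native` build from CMakeLists.txt can run inside the container. As an end-to-end check of the rebuilt client, a companion read-back sketch for the hypothetical smoke-test file from the write example earlier:

```c
/* Sketch: read back the hypothetical smoke-test file. Uses the same
 * placeholder namenode address as the write example above. */
#include <fcntl.h>
#include <stdio.h>
#include <hdfs.h>

int main(void) {
  hdfsFS fs = hdfsConnect("localhost", 9000);
  if (!fs) return 1;
  hdfsFile file = hdfsOpenFile(fs, "/tmp/hugectr_hdfs_smoke.txt", O_RDONLY, 0, 0, 0);
  if (!file) {
    hdfsDisconnect(fs);
    return 1;
  }
  char buf[256];
  tSize n = hdfsRead(fs, file, buf, (tSize)(sizeof(buf) - 1));
  if (n >= 0) {
    buf[n] = '\0';
    printf("%s", buf);
  }
  hdfsCloseFile(fs, file);
  hdfsDisconnect(fs);
  return 0;
}
```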