From ee5bbbaed6ccb1810bb2c909d284bd523b5d3473 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:35:50 -0600 Subject: [PATCH 01/17] [ci] speed up if-else, swig, and lint conda setup --- .ci/test.sh | 107 +++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index f18198b5924f..137f3c58a0b5 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -34,7 +34,61 @@ if [[ "$TASK" == "cpp-tests" ]]; then exit 0 fi -conda create -q -y -n $CONDA_ENV "python=$PYTHON_VERSION[build=*cpython]" +CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" + +if [[ $TASK == "if-else" ]]; then + conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 + cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 + exit 0 +fi + +if [[ $TASK == "swig" ]]; then + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build + if [[ $OS_NAME == "macos" ]]; then + cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. + else + cmake -DUSE_SWIG=ON .. + fi + make -j4 || exit -1 + if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then + objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 + objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 + python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 + fi + if [[ $PRODUCES_ARTIFACTS == "true" ]]; then + cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar + fi + exit 0 +fi + +if [[ $TASK == "lint" ]]; then + conda create -q -y -n $CONDA_ENV \ + ${CONDA_PYTHON_REQUIREMENT} \ + cmakelint \ + cpplint \ + isort \ + mypy \ + pycodestyle \ + pydocstyle \ + "r-lintr>=3.0" + echo "Linting Python code" + pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 + pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + isort . --check-only || exit -1 + mypy --ignore-missing-imports python-package/ || true + echo "Linting R code" + Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 + echo "Linting C++ code" + cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 + cmake_files=$(find . 
-name CMakeLists.txt -o -path "*/cmake/*.cmake") + cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 + exit 0 +fi + +conda create -q -y -n $CONDA_ENV "${CONDA_PYTHON_REQUIREMENT}" source activate $CONDA_ENV cd $BUILD_DIRECTORY @@ -72,57 +126,6 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi -if [[ $TASK == "lint" ]]; then - conda install -q -y -n $CONDA_ENV \ - cmakelint \ - cpplint \ - isort \ - mypy \ - pycodestyle \ - pydocstyle \ - "r-lintr>=3.0" - echo "Linting Python code" - pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 - isort . --check-only || exit -1 - mypy --ignore-missing-imports python-package/ || true - echo "Linting R code" - Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 - echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 - cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") - cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 - exit 0 -fi - -if [[ $TASK == "if-else" ]]; then - conda install -q -y -n $CONDA_ENV numpy - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 - cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 - exit 0 -fi - -if [[ $TASK == "swig" ]]; then - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build - if [[ $OS_NAME == "macos" ]]; then - cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. - else - cmake -DUSE_SWIG=ON .. - fi - make -j4 || exit -1 - if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then - objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 - objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 - python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 - fi - if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar - fi - exit 0 -fi - # re-including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda install -q -y -n $CONDA_ENV \ cloudpickle \ From 12f8f555b7b8d67793ab3bbf9eadd047c827d249 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:47:14 -0600 Subject: [PATCH 02/17] add 'source activate' --- .ci/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/test.sh b/.ci/test.sh index 137f3c58a0b5..6d7ecfc6635d 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -38,6 +38,7 @@ CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" if [[ $TASK == "if-else" ]]; then conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy + source activate $CONDA_ENV mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. 
&& make lightgbm -j4 || exit -1 cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 @@ -74,6 +75,7 @@ if [[ $TASK == "lint" ]]; then pycodestyle \ pydocstyle \ "r-lintr>=3.0" + source activate $CONDA_ENV echo "Linting Python code" pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 From a981a0741db8dddf2f9cd9fbbac61f5369626455 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:47:56 -0600 Subject: [PATCH 03/17] python constraint --- .ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test.sh b/.ci/test.sh index 6d7ecfc6635d..c66f24abc74c 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -139,7 +139,7 @@ conda install -q -y -n $CONDA_ENV \ pandas \ psutil \ pytest \ - "python=$PYTHON_VERSION[build=*cpython]" \ + ${CONDA_PYTHON_REQUIREMENT} \ python-graphviz \ scikit-learn \ scipy || exit -1 From 05ee8a124dfee077db61143a1cf7f292fa5de16f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 16 Jan 2023 22:58:44 -0600 Subject: [PATCH 04/17] start removing cuda v1 --- .ci/test.sh | 35 +- .github/workflows/cuda.yml | 20 +- CMakeLists.txt | 33 +- docs/Installation-Guide.rst | 4 +- docs/Parameters.rst | 6 +- include/LightGBM/config.h | 4 +- python-package/setup.py | 7 +- src/objective/objective_function.cpp | 24 +- src/treelearner/cuda_kernel_launcher.cu | 171 ---- src/treelearner/cuda_kernel_launcher.h | 70 -- src/treelearner/cuda_tree_learner.cpp | 1031 ----------------------- src/treelearner/cuda_tree_learner.h | 261 ------ src/treelearner/serial_tree_learner.cpp | 8 - 13 files changed, 47 insertions(+), 1627 deletions(-) delete mode 100644 src/treelearner/cuda_kernel_launcher.cu delete mode 100644 src/treelearner/cuda_kernel_launcher.h delete mode 100644 src/treelearner/cuda_tree_learner.cpp delete mode 100644 src/treelearner/cuda_tree_learner.h diff --git a/.ci/test.sh b/.ci/test.sh index c66f24abc74c..12b4ae027993 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -201,41 +201,24 @@ if [[ $TASK == "gpu" ]]; then elif [[ $METHOD == "source" ]]; then cmake -DUSE_GPU=ON .. fi -elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then - if [[ $TASK == "cuda" ]]; then - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - else - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - # by default ``gpu_use_dp=false`` for efficiency. 
change to ``true`` here for exact results in ci tests - sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - fi +elif [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + # by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests + sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - if [[ $TASK == "cuda" ]]; then - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 - else - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1 - fi + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - if [[ $TASK == "cuda" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 - else - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1 - fi + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then - if [[ $TASK == "cuda" ]]; then - cmake -DUSE_CUDA=ON .. - else - cmake -DUSE_CUDA_EXP=ON .. - fi + cmake -DUSE_CUDA=ON .. 
fi elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 946f548784a6..ca17ff2ca587 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -28,31 +28,21 @@ jobs: fail-fast: false matrix: include: - - method: source - compiler: gcc - python_version: "3.8" - cuda_version: "11.7.1" - task: cuda - - method: pip - compiler: clang - python_version: "3.9" - cuda_version: "10.0" - task: cuda - method: wheel compiler: gcc python_version: "3.10" - cuda_version: "9.0" + cuda_version: "11.7.1" task: cuda - method: source compiler: gcc python_version: "3.8" - cuda_version: "11.7.1" - task: cuda_exp + cuda_version: "10.0" + task: cuda - method: pip compiler: clang python_version: "3.9" - cuda_version: "10.0" - task: cuda_exp + cuda_version: "11.7.1" + task: cuda steps: - name: Setup or update software on host machine run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index b7fa5dc8f330..8d005662f37a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,8 +4,8 @@ option(USE_GPU "Enable GPU-accelerated training" OFF) option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) -option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) -option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" OFF) +option(USE_CUDA "Enable CUDA-accelerated training " OFF) +option(USE_CUDA_EXP "(DEPRECATED) Alias for 'USE_CUDA'. Use 'USE_CUDA' instead." OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( @@ -31,7 +31,7 @@ elseif(USE_SWIG) cmake_minimum_required(VERSION 3.8) elseif(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) -elseif(USE_CUDA OR USE_CUDA_EXP) +elseif(USE_CUDA) cmake_minimum_required(VERSION 3.16) else() cmake_minimum_required(VERSION 3.0) @@ -137,7 +137,12 @@ else() add_definitions(-DUSE_SOCKET) endif() -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA_EXP) + message(WARNING "Option -DUSE_CUDA_EXP=ON is deprecated. 
Use -DUSE_CUDA=ON instead.") + set(USE_CUDA ON CACHE BOOL "Building CUDA-enabled version" FORCE) +endif() + +if(USE_CUDA) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") enable_language(CUDA) set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) @@ -192,12 +197,8 @@ if(__INTEGRATE_OPENCL) endif() endif() -if(USE_CUDA OR USE_CUDA_EXP) - if(USE_CUDA) - find_package(CUDA 9.0 REQUIRED) - else() - find_package(CUDA 10.0 REQUIRED) - endif() +if(USE_CUDA) + find_package(CUDA 10.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall") @@ -224,11 +225,7 @@ if(USE_CUDA OR USE_CUDA_EXP) endif() message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - if(USE_CUDA) - add_definitions(-DUSE_CUDA) - elseif(USE_CUDA_EXP) - add_definitions(-DUSE_CUDA_EXP) - endif() + add_definitions(-DUSE_CUDA) if(NOT DEFINED CMAKE_CUDA_STANDARD) set(CMAKE_CUDA_STANDARD 11) @@ -411,10 +408,8 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA) src/treelearner/*.cu -endif() -if(USE_CUDA_EXP) src/boosting/cuda/*.cpp src/boosting/cuda/*.cu src/metric/cuda/*.cpp @@ -549,7 +544,7 @@ if(__INTEGRATE_OPENCL) target_link_libraries(lightgbm_objs PUBLIC ${INTEGRATED_OPENCL_LIBRARIES} ${CMAKE_DL_LIBS}) endif() -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA) # Disable cmake warning about policy CMP0104. Refer to issue #3754 and PR #4268. # Custom target properties does not propagate, thus we need to specify for # each target that contains or depends on cuda source. diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 3f8cc45f2160..6ce3f2f0c987 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -621,7 +621,7 @@ On Linux a CUDA version of LightGBM can be built using **CUDA**, **CMake** and * The following dependencies should be installed before compilation: -- **CUDA** 9.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. +- **CUDA** 10.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. - **CMake** 3.16 or later. @@ -636,8 +636,6 @@ To build LightGBM CUDA version, run the following commands: cmake -DUSE_CUDA=1 .. make -j4 -Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. - **Note**: glibc >= 2.14 is required. **Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this). 
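For reference, after this series the CUDA build is driven by the single ``-DUSE_CUDA`` switch, with ``-DUSE_CUDA_EXP`` retained only as a deprecated alias. A minimal sketch of the documented build follows; the toolchain version checks are illustrative additions (not part of the patch), while the version requirements (CUDA >= 10.0, CMake >= 3.16) and build commands come from the Installation Guide changes above:

    nvcc --version        # confirm CUDA 10.0 or later is installed (illustrative check)
    cmake --version       # confirm CMake 3.16 or later is installed (illustrative check)
    mkdir build
    cd build
    cmake -DUSE_CUDA=1 ..
    make -j4
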
diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 4ac77d407ba6..d2cb9acdaf6d 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -205,7 +205,7 @@ Core Parameters - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors -- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device`` +- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` - device for the tree learning, you can use GPU to achieve the faster learning @@ -215,10 +215,6 @@ Core Parameters - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - - **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` - - - **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future - - ``seed`` :raw-html:`🔗︎`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state`` - this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4456d19b4da3..c2de146b3854 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -223,14 +223,12 @@ struct Config { // [doc-only] // type = enum - // options = cpu, gpu, cuda, cuda_exp + // options = cpu, gpu, cuda // alias = device // desc = device for the tree learning, you can use GPU to achieve the faster learning // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. 
You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` - // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future std::string device_type = "cpu"; // [doc-only] diff --git a/python-package/setup.py b/python-package/setup.py index f40229f2c430..af07bfbdd5b4 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -128,6 +128,9 @@ def compile_cpp( logger.info("Starting to compile the library.") + if use_cuda_exp: + use_cuda = True + cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")] if integrated_opencl: use_gpu = False @@ -148,8 +151,6 @@ def compile_cpp( cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}") elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") - elif use_cuda_exp: - cmake_cmd.append("-DUSE_CUDA_EXP=ON") if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -171,7 +172,7 @@ def compile_cpp( else: status = 1 lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll" - if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): + if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): logger.info("Starting to compile with MSBuild from existing solution file.") platform_toolsets = ("v143", "v142", "v141", "v140") for pt in platform_toolsets: diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 79749570d672..3ce424721f86 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -18,8 +18,8 @@ namespace LightGBM { ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { - #ifdef USE_CUDA_EXP - if (config.device_type == std::string("cuda_exp") && + #ifdef USE_CUDA + if (config.device_type == std::string("cuda") && config.data_sample_strategy != std::string("goss") && config.boosting != std::string("rf")) { if (type == std::string("regression")) { @@ -27,7 +27,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("regression_l1")) { return new CUDARegressionL1loss(config); } else if (type == std::string("quantile")) { - Log::Warning("Objective quantile is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective quantile is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionQuantileloss(config); } else if (type == std::string("huber")) { return new CUDARegressionHuberLoss(config); @@ -46,26 +46,26 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("multiclassova")) { return new CUDAMulticlassOVA(config); } else if (type == std::string("cross_entropy")) { - Log::Warning("Objective cross_entropy is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); return new CrossEntropy(config); } else if (type == std::string("cross_entropy_lambda")) { - Log::Warning("Objective cross_entropy_lambda is not implemented in cuda_exp version. 
Fall back to boosting on CPU."); + Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); return new CrossEntropyLambda(config); } else if (type == std::string("mape")) { - Log::Warning("Objective mape is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective mape is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionMAPELOSS(config); } else if (type == std::string("gamma")) { - Log::Warning("Objective gamma is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionGammaLoss(config); } else if (type == std::string("tweedie")) { - Log::Warning("Objective tweedie is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionTweedieLoss(config); } else if (type == std::string("custom")) { - Log::Warning("Using customized objective with cuda_exp. This requires copying gradients from CPU to GPU, which can be slow."); + Log::Warning("Using customized objective with cuda. This requires copying gradients from CPU to GPU, which can be slow."); return nullptr; } } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (type == std::string("regression")) { return new RegressionL2loss(config); } else if (type == std::string("regression_l1")) { @@ -101,9 +101,9 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("custom")) { return nullptr; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA Log::Fatal("Unknown objective type name: %s", type.c_str()); return nullptr; } diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu deleted file mode 100644 index 05724695100e..000000000000 --- a/src/treelearner/cuda_kernel_launcher.cu +++ /dev/null @@ -1,171 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ -#ifdef USE_CUDA - -#include "cuda_kernel_launcher.h" - -#include - -#include - -#include - -namespace LightGBM { - -void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } else if (histogram_size == 64) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } else { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, 
arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h deleted file mode 100644 index 0714e05b2f2d..000000000000 --- a/src/treelearner/cuda_kernel_launcher.h +++ /dev/null @@ -1,70 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ -#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ - -#ifdef USE_CUDA -#include -#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t - -namespace LightGBM { - -struct ThreadData { - // device id - int device_id; - // parameters for cuda_histogram - int histogram_size; - data_size_t leaf_num_data; - data_size_t num_data; - bool use_all_features; - bool is_constant_hessian; - int num_workgroups; - cudaStream_t stream; - uint8_t* device_features; - uint8_t* device_feature_masks; - data_size_t* device_data_indices; - score_t* device_gradients; - score_t* device_hessians; - score_t hessians_const; - char* device_subhistograms; - volatile int* sync_counters; - void* device_histogram_outputs; - size_t exp_workgroups_per_feature; - // cuda events - cudaEvent_t* kernel_start; - cudaEvent_t* kernel_wait_obj; - std::chrono::duration* kernel_input_wait_time; - // copy histogram - size_t output_size; - char* host_histogram_output; - cudaEvent_t* histograms_wait_obj; -}; - - -void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature); - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp deleted file mode 100644 index a6bd4c47ae06..000000000000 --- a/src/treelearner/cuda_tree_learner.cpp +++ /dev/null @@ -1,1031 +0,0 @@ -/*! 
- * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifdef USE_CUDA -#include "cuda_tree_learner.h" - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "../io/dense_bin.hpp" - -namespace LightGBM { - -#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy -#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData - -#define CUDA_DEBUG 0 - -static void *launch_cuda_histogram(void *thread_data) { - ThreadData td = *(reinterpret_cast(thread_data)); - int device_id = td.device_id; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - // launch cuda kernel - cuda_histogram(td.histogram_size, - td.leaf_num_data, td.num_data, td.use_all_features, - td.is_constant_hessian, td.num_workgroups, td.stream, - td.device_features, - td.device_feature_masks, - td.num_data, - td.device_data_indices, - td.leaf_num_data, - td.device_gradients, - td.device_hessians, td.hessians_const, - td.device_subhistograms, td.sync_counters, - td.device_histogram_outputs, - td.exp_workgroups_per_feature); - - CUDASUCCESS_OR_FATAL(cudaGetLastError()); - - return NULL; -} - -CUDATreeLearner::CUDATreeLearner(const Config* config) - :SerialTreeLearner(config) { - use_bagging_ = false; - nthreads_ = 0; - if (config->gpu_use_dp && USE_DP_FLOAT) { - Log::Info("LightGBM using CUDA trainer with DP float!!"); - } else { - Log::Info("LightGBM using CUDA trainer with SP float!!"); - } -} - -CUDATreeLearner::~CUDATreeLearner() { - #pragma omp parallel for schedule(static, num_gpu_) - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - if (device_features_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); - } - - if (device_gradients_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); - } - - if (device_hessians_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); - } - - if (device_feature_masks_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); - } - - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } - - if (sync_counters_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - } - - if (device_subhistograms_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - } - - if (device_histogram_outputs_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - } - } -} - - -void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian); - - // some additional variables needed for GPU trainer - num_feature_groups_ = train_data_->num_feature_groups(); - - // Initialize GPU buffers and kernels: get device info - InitGPU(config_->num_gpu); -} - -// some functions used for debugging the GPU histogram construction -#if CUDA_DEBUG > 0 - -void PrintHistograms(hist_t* h, size_t size) { - double total_hess = 0; - for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 3) == 3) - printf("\n"); - total_hess += GET_HESS(h, i); - } - printf("\nSum hessians: %9.3g\n", total_hess); -} - -union Float_t { - int64_t i; - double f; - static int64_t 
ulp_diff(Float_t a, Float_t b) { - return abs(a.i - b.i); - } -}; - -int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { - int i; - int retval = 0; - printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); - if (dp_flag) { // double precision - double af, bf; - int64_t ai, bi; - for (i = 0; i < static_cast(size); ++i) { - af = GET_GRAD(h1, i); - bf = GET_GRAD(h2, i); - if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { - printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); - ++retval; - } - if (const_flag) { - ai = GET_HESS((reinterpret_cast(h1)), i); - bi = GET_HESS((reinterpret_cast(h2)), i); - if (ai != bi) { - printf("i = %5d, h1.hess %" PRId64 ", h2.hess %" PRId64 "\n", i, ai, bi); - ++retval; - } - } else { - af = GET_HESS(h1, i); - bf = GET_HESS(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-6) { - printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); - ++retval; - } - } - } - } else { // single precision - float af, bf; - int ai, bi; - for (i = 0; i < static_cast(size); ++i) { - af = GET_GRAD(h1, i); - bf = GET_GRAD(h2, i); - if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { - printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); - ++retval; - } - if (const_flag) { - ai = GET_HESS(h1, i); - bi = GET_HESS(h2, i); - if (ai != bi) { - printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); - ++retval; - } - } else { - af = GET_HESS(h1, i); - bf = GET_HESS(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-5) { - printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); - ++retval; - } - } - } - } - printf("DONE Comparing Histograms...\n"); - return retval; -} -#endif - -int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { - // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. 
- // also guarantee that there are at least 2K examples per workgroup - double x = 256.0 / num_dense_feature_groups_; - - int exp_workgroups_per_feature = static_cast(ceil(log2(x))); - double t = leaf_num_data / 1024.0; - - Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons\n" - "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); - if (exp_workgroups_per_feature < 0) - exp_workgroups_per_feature = 0; - if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) - exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature; - - return exp_workgroups_per_feature; -} - -void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { - // we have already copied ordered gradients, ordered hessians and indices to GPU - // decide the best number of workgroups working on one feature4 tuple - // set work group size based on feature size - // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); - std::vector num_gpu_workgroups; - ThreadData *thread_data = reinterpret_cast(_mm_malloc(sizeof(ThreadData) * num_gpu_, 16)); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; - int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; - num_gpu_workgroups.push_back(num_workgroups); - if (num_workgroups > preallocd_max_num_wg_[device_id]) { - preallocd_max_num_wg_.at(device_id) = num_workgroups; - CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); - } - // set thread_data - SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, - num_workgroups, exp_workgroups_per_feature); - } - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { - Log::Fatal("Error in creating threads."); - } - } - - /* Wait for the threads to finish */ - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_join(*(cpu_threads_[device_id]), NULL)) { - Log::Fatal("Error in joining threads."); - } - } - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // copy the results asynchronously. 
Size depends on if double precision is used - - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(reinterpret_cast(host_histogram_outputs_) + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); - } -} - - -template -void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { - HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); - - #pragma omp parallel for schedule(static, num_gpu_) - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // when the output is ready, the computation is done - CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); - } - - HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); - #pragma omp parallel for schedule(static) - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (!feature_masks_[i]) { - continue; - } - int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; - int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); - - for (int j = 0; j < bin_size; ++j) { - GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); - GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); - } - } -} - -void CUDATreeLearner::CountDenseFeatureGroups() { - num_dense_feature_groups_ = 0; - - for (int i = 0; i < num_feature_groups_; ++i) { - if (!train_data_->IsMultiGroup(i)) { - num_dense_feature_groups_++; - } - } - if (!num_dense_feature_groups_) { - Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found"); - } -} - -void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have - // leave some safe margin for prefetching - // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature - - allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); - - // clear sparse/dense maps - dense_feature_group_map_.clear(); - sparse_feature_group_map_.clear(); - - // do nothing it there is no dense feature - if (!num_dense_feature_groups_) { - return; - } - - // calculate number of feature groups per gpu - num_gpu_feature_groups_.resize(num_gpu_); - offset_gpu_feature_groups_.resize(num_gpu_); - int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; - int remain_features = num_dense_feature_groups_ - num_features_per_gpu * num_gpu_; - - int offset = 0; - - for (int i = 0; i < num_gpu_; ++i) { - offset_gpu_feature_groups_.at(i) = offset; - num_gpu_feature_groups_.at(i) = (i < remain_features) ? num_features_per_gpu + 1 : num_features_per_gpu; - offset += num_gpu_feature_groups_.at(i); - } - - feature_masks_.resize(num_dense_feature_groups_); - Log::Debug("Resized feature masks"); - - ptr_pinned_feature_masks_ = feature_masks_.data(); - Log::Debug("Memset pinned_feature_masks_"); - memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); - - // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), static_cast(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); - - nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); - nthreads_ = std::max(nthreads_, 1); -} - -// allocate GPU memory for each GPU -void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // do nothing it there is no gpu feature - int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; - if (num_gpu_feature_groups) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - // allocate memory for all features - if (device_features_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), static_cast(num_gpu_feature_groups * num_data_ * sizeof(uint8_t)))); - Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); - - // allocate space for gradients and hessians on device - // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); - } - - if (device_hessians_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); - } - - if (device_feature_masks_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), static_cast(num_gpu_feature_groups))); - - // copy indices to the device - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), static_cast(allocated_num_data_ * sizeof(data_size_t)))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); - - Log::Debug("Memset device_data_indices_"); - - // create output buffer, each feature has a histogram with device_bin_size_ bins, - // each work group generates a sub-histogram of dword_features_ features. 
- if (!device_subhistograms_[device_id]) { - // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); - - Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); - } - - // create atomic counters for inter-group coordination - CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), static_cast(num_gpu_feature_groups * sizeof(int)))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); - - // The output buffer is allocated to host directly, to overlap compute and data transfer - CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), static_cast(num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_))); - } - } -} - -void CUDATreeLearner::ResetGPUMemory() { - // clear sparse/dense maps - dense_feature_group_map_.clear(); - sparse_feature_group_map_.clear(); -} - -void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0) { - LGBM_config_::current_learner = use_cpu_learner; - return; - } - - Log::Debug("Started copying dense features from CPU to GPU"); - // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) - size_t copied_feature = 0; - // set device info - int device_id = 0; - uint8_t* device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - Log::Debug("Started copying dense features from CPU to GPU - 1"); - - for (int i = 0; i < num_feature_groups_; ++i) { - // looking for dword_features_ non-sparse feature-groups - if (!train_data_->IsMultiGroup(i)) { - dense_feature_group_map_.push_back(i); - auto sizes_in_byte = std::min(train_data_->FeatureGroupSizesInByte(i), static_cast(num_data_)); - void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); - copied_feature++; - // reset device info - if (copied_feature == static_cast(num_gpu_feature_groups_[device_id])) { - CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); - device_id += 1; - copied_feature = 0; - if (device_id < num_gpu_) { - device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - } - } - } else { - sparse_feature_group_map_.push_back(i); - } - } -} - - - -// InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { - // Get the max bin size, used for selecting best GPU kernel - max_num_bin_ = 0; - - #if CUDA_DEBUG >= 1 - printf("bin_size: "); - #endif - for (int i = 0; i < num_feature_groups_; ++i) { - if (train_data_->IsMultiGroup(i)) { - continue; - } - #if CUDA_DEBUG >= 1 - printf("%d, ", train_data_->FeatureGroupNumBin(i)); - #endif - max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); - } - #if CUDA_DEBUG >= 1 - printf("\n"); - #endif - - if (max_num_bin_ <= 16) { - device_bin_size_ = 16; - histogram_size_ = 16; - 
dword_features_ = 1; - } else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; - histogram_size_ = 64; - dword_features_ = 1; - } else if (max_num_bin_ <= 256) { - Log::Debug("device_bin_size_ = 256"); - device_bin_size_ = 256; - histogram_size_ = 256; - dword_features_ = 1; - } else { - Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); - } - - // ignore the feature groups that contain categorical features when producing warnings about max_bin. - // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin. - int max_num_bin_no_categorical = 0; - int cur_feature_group = 0; - bool categorical_feature_found = false; - for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { - const int feature_group = train_data_->Feature2Group(inner_feature_index); - const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); - if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { - categorical_feature_found = true; - } - if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { - if (!categorical_feature_found) { - max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); - } - categorical_feature_found = false; - cur_feature_group = feature_group; - } - } - if (max_num_bin_no_categorical == 65) { - Log::Warning("Setting max_bin to 63 is suggested for best performance"); - } - if (max_num_bin_no_categorical == 17) { - Log::Warning("Setting max_bin to 15 is suggested for best performance"); - } - - // get num_dense_feature_groups_ - CountDenseFeatureGroups(); - - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - - // initialize GPU - int gpu_count; - - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu) ? 
gpu_count : num_gpu; - - // set cpu threads - cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = reinterpret_cast(_mm_malloc(sizeof(pthread_t), 16)); - } - - // resize device memory pointers - device_features_.resize(num_gpu_); - device_gradients_.resize(num_gpu_); - device_hessians_.resize(num_gpu_); - device_feature_masks_.resize(num_gpu_); - device_data_indices_.resize(num_gpu_); - sync_counters_.resize(num_gpu_); - device_subhistograms_.resize(num_gpu_); - device_histogram_outputs_.resize(num_gpu_); - - // create stream & events to handle multiple GPUs - preallocd_max_num_wg_.resize(num_gpu_, 1024); - stream_.resize(num_gpu_); - hessians_future_.resize(num_gpu_); - gradients_future_.resize(num_gpu_); - indices_future_.resize(num_gpu_); - features_future_.resize(num_gpu_); - kernel_start_.resize(num_gpu_); - kernel_wait_obj_.resize(num_gpu_); - histograms_wait_obj_.resize(num_gpu_); - - for (int i = 0; i < num_gpu_; ++i) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(gradients_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(indices_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(features_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_start_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_wait_obj_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); - } - - allocated_num_data_ = 0; - prevAllocateGPUMemory(); - - AllocateGPUMemory(); - - copyDenseFeature(); -} - -Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) { - Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_first_tree); - return ret; -} - -void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // check data size - data_size_t old_allocated_num_data = allocated_num_data_; - - SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - - #if ResetTrainingData_DEBUG == 1 - serial_time = std::chrono::steady_clock::now() - start_serial_time; - #endif - - num_feature_groups_ = train_data_->num_feature_groups(); - - // GPU memory has to been reallocated because data may have been changed - #if ResetTrainingData_DEBUG == 1 - auto start_alloc_gpu_time = std::chrono::steady_clock::now(); - #endif - - // AllocateGPUMemory only when the number of data increased - int old_num_feature_groups = num_dense_feature_groups_; - CountDenseFeatureGroups(); - if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { - prevAllocateGPUMemory(); - AllocateGPUMemory(); - } else { - ResetGPUMemory(); - } - - copyDenseFeature(); - - #if ResetTrainingData_DEBUG == 1 - alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; - #endif - - // setup GPU kernel arguments after we allocating all the buffers - #if ResetTrainingData_DEBUG == 1 - auto start_set_arg_time = std::chrono::steady_clock::now(); - #endif - - #if ResetTrainingData_DEBUG == 1 - set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; - reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; - Log::Info("reset_training_data_time: %f 
secs.", reset_training_data_time.count() * 1e-3); - Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); - Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); - Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); - #endif -} - -void CUDATreeLearner::BeforeTrain() { - #if cudaMemcpy_DEBUG == 1 - std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); - std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); - #endif - - SerialTreeLearner::BeforeTrain(); - - #if CUDA_DEBUG >= 2 - printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); - #endif - - // Copy initial full hessians and gradients to GPU. - // We start copying as early as possible, instead of at ConstructHistogram(). - if ((hessians_ != NULL) && (gradients_ != NULL)) { - if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!(share_state_->is_constant_hessian)) { - Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - - #if cudaMemcpy_DEBUG == 1 - auto start_device_hessians_time = std::chrono::steady_clock::now(); - #endif - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - - #if cudaMemcpy_DEBUG == 1 - device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; - #endif - - Log::Debug("queued copy of device_hessians_"); - } - - #if cudaMemcpy_DEBUG == 1 - auto start_device_gradients_time = std::chrono::steady_clock::now(); - #endif - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - - #if cudaMemcpy_DEBUG == 1 - device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; - #endif - - Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); - } - } - } - - // use bagging - if ((hessians_ != NULL) && (gradients_ != NULL)) { - if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() - // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); - data_size_t cnt = data_partition_->leaf_count(0); - - // transfer the indices to GPU - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - - if (!(share_state_->is_constant_hessian)) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], 
const_cast(reinterpret_cast(&(gradients_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - } - } - } -} - -bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { - int smaller_leaf; - - data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); - data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); - - // only have root - if (right_leaf < 0) { - smaller_leaf = -1; - } else if (num_data_in_left_child < num_data_in_right_child) { - smaller_leaf = left_leaf; - } else { - smaller_leaf = right_leaf; - } - - // Copy indices, gradients and hessians as early as possible - if (smaller_leaf >= 0 && num_dense_feature_groups_) { - // only need to initialize for smaller leaf - // Get leaf boundary - const data_size_t* indices = data_partition_->indices(); - data_size_t begin = data_partition_->leaf_begin(smaller_leaf); - data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } - } - - const bool ret = SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); - - return ret; -} - -bool CUDATreeLearner::ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { - return false; - } - - // do nothing if no features can be processed on GPU - if (!num_dense_feature_groups_) { - Log::Debug("no dense feature groups, returning"); - return false; - } - - // copy data indices if it is not null - if (data_indices != nullptr && num_data != num_data_) { - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } - } - - // converted indices in is_feature_used to feature-group indices - std::vector is_feature_group_used(num_feature_groups_, 0); - - #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) - for (int i = 0; i < num_features_; ++i) { - if (is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 
2 : 1; - } - } - - // construct the feature masks for dense feature-groups - int used_dense_feature_groups = 0; - #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (is_feature_group_used[dense_feature_group_map_[i]]) { - feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; - ++used_dense_feature_groups; - } else { - feature_masks_[i] = 0; - } - } - bool use_all_features = ((used_dense_feature_groups == num_dense_feature_groups_) && (data_indices != nullptr)); - // if no feature group is used, just return and do not use GPU - if (used_dense_feature_groups == 0) { - return false; - } - - // if not all feature groups are used, we need to transfer the feature mask to GPU - // otherwise, we will use a specialized GPU kernel with all feature groups enabled - - // We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); - } - - // All data have been prepared, now run the GPU kernel - GPUHistogram(num_data, use_all_features); - - return true; -} - -void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - std::vector is_sparse_feature_used(num_features_, 0); - std::vector is_dense_feature_used(num_features_, 0); - int num_dense_features = 0, num_sparse_features = 0; - - #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; - if (!is_feature_used[feature_index]) continue; - if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { - is_sparse_feature_used[feature_index] = 1; - num_sparse_features++; - } else { - is_dense_feature_used[feature_index] = 1; - num_dense_features++; - } - } - - // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - - // Check workgroups per feature4 tuple.. 
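// Editor's sketch, not part of the removed learner: the call just below uses
// GetNumWorkgroupsPerFeature(), which (per the removed header's doc comment)
// returned the log2 of the number of workgroups to launch per feature, so the
// caller could fall back to the CPU path when the leaf was too small. A minimal,
// self-contained illustration of that idea follows; kMaxLogWorkgroupsPerFeature
// mirrors the constant declared in the removed header, while the function name
// and kRowsPerWorkgroup tile size are purely illustrative assumptions.
static int ExampleLogWorkgroupsPerFeature(int leaf_num_data) {
  const int kMaxLogWorkgroupsPerFeature = 10;  // at most 2^10 workgroups, as in the removed header
  const int kRowsPerWorkgroup = 1024;          // assumed rows covered by one workgroup (illustrative)
  int exp = 0;
  while ((kRowsPerWorkgroup << exp) < leaf_num_data && exp < kMaxLogWorkgroupsPerFeature) {
    ++exp;  // double the workgroup count until the leaf is covered or the cap is reached
  }
  return exp;  // 0 signals "too little work for the GPU, use the serial CPU learner"
}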
- int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); - - // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0) { - return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } - - // ConstructGPUHistogramsAsync will return true if there are availabe feature groups dispatched to GPU - bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf()); - - // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { - train_data_->ConstructHistograms(is_sparse_feature_used, - smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), - gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), - share_state_.get(), - ptr_smaller_leaf_hist_data); - } - - // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { - if (config_->gpu_use_dp) { - // use double precision - WaitAndGetHistograms(smaller_leaf_histogram_array_); - } else { - // use single precision - WaitAndGetHistograms(smaller_leaf_histogram_array_); - } - } - - // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem - // #define CUDA_DEBUG_COMPARE -#ifdef CUDA_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); - bool compare = true; - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (!feature_masks_[i]) - continue; - int dense_feature_group_index = dense_feature_group_map_[i]; - size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; - hist_t* gpu_histogram = new hist_t[size * 2]; - data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %lu bins\n", dense_feature_group_index, num_data, num_data_, size); - std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); - std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); - if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { - continue; - } - if (num_data == num_data_) { - if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, - current_histogram); - } else { - printf("ConstructHistogram(): num_data == num_data_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, hessians_, - current_histogram); - } - } else { - if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): is_constant_hessian\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, - current_histogram); - } else { - printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, hessians_, - 
current_histogram); - } - } - int retval; - if ((num_data != num_data_) && compare) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); - printf("CompareHistograms reports %d errors\n", retval); - compare = false; - } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); - if (num_data == num_data_) { - printf("CompareHistograms reports %d errors\n", retval); - } else { - printf("CompareHistograms reports %d errors\n", retval); - } - std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); - delete [] gpu_histogram; - } - printf("End Comparing Histogram between GPU and CPU\n"); - fflush(stderr); - fflush(stdout); -#endif - - if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; - - is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, - larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); - - // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { - train_data_->ConstructHistograms(is_sparse_feature_used, - larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), - gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), - share_state_.get(), - ptr_larger_leaf_hist_data); - } - - // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { - if (config_->gpu_use_dp) { - // use double precision - WaitAndGetHistograms(larger_leaf_histogram_array_); - } else { - // use single precision - WaitAndGetHistograms(larger_leaf_histogram_array_); - } - } - } -} - -void CUDATreeLearner::FindBestSplits(const Tree* tree) { - SerialTreeLearner::FindBestSplits(tree); - -#if CUDA_DEBUG >= 3 - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; - if (parent_leaf_histogram_array_ != nullptr - && !parent_leaf_histogram_array_[feature_index].is_splittable()) { - smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); - continue; - } - size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); - } -#endif -} - -void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { - const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; -#if CUDA_DEBUG >= 2 - printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); -#endif - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); 
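// Editor's sketch relating to the CUDA_DEBUG_COMPARE block above: the removed
// CompareHistograms() helper counted mismatched bins between the GPU and CPU
// histograms, which are stored as interleaved (sum_gradient, sum_hessian) pairs.
// The simplified signature, the double element type, and the tolerance value
// below are assumptions for illustration only, not the removed implementation.
#include <algorithm>
#include <cmath>
#include <cstdio>
static int ExampleCompareHistograms(const double* gpu_hist, const double* cpu_hist,
                                    size_t num_bins, double rel_tol = 1e-4) {
  int errors = 0;
  for (size_t i = 0; i < 2 * num_bins; ++i) {  // two entries (gradient, hessian) per bin
    const double denom = std::max(std::fabs(cpu_hist[i]), 1.0);
    if (std::fabs(gpu_hist[i] - cpu_hist[i]) / denom > rel_tol) {
      std::printf("bin %zu: gpu=%f cpu=%f\n", i / 2, gpu_hist[i], cpu_hist[i]);
      ++errors;
    }
  }
  return errors;  // number of bins whose GPU and CPU results disagree
}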
- if (Network::num_machines() == 1) { - // do some sanity check for the GPU algorithm - if (best_split_info.left_count < best_split_info.right_count) { - if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || - (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); - } - } else { - if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || - (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); - } - } - } -} - -} // namespace LightGBM -#undef cudaMemcpy_DEBUG -#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h deleted file mode 100644 index b0e6fb3b4628..000000000000 --- a/src/treelearner/cuda_tree_learner.h +++ /dev/null @@ -1,261 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#ifdef USE_CUDA -#include -#endif - -#include "feature_histogram.hpp" -#include "serial_tree_learner.h" -#include "data_partition.hpp" -#include "split_info.hpp" -#include "leaf_splits.hpp" - -#ifdef USE_CUDA -#include -#include "cuda_kernel_launcher.h" - - -using json11::Json; - -namespace LightGBM { - -/*! -* \brief CUDA-based parallel learning algorithm. -*/ -class CUDATreeLearner: public SerialTreeLearner { - public: - explicit CUDATreeLearner(const Config* tree_config); - ~CUDATreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; - void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; - void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - if (subset == nullptr && used_indices != nullptr) { - if (num_data != num_data_) { - use_bagging_ = true; - return; - } - } - use_bagging_ = false; - } - - protected: - void BeforeTrain() override; - bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits(const Tree* tree) override; - void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; - - private: - typedef float gpu_hist_t; - - /*! - * \brief Find the best number of workgroups processing one feature for maximizing efficiency - * \param leaf_num_data The number of data examples on the current leaf being processed - * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature - */ - int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - - /*! 
- * \brief Initialize GPU device - * \param num_gpu: number of maximum gpus - */ - void InitGPU(int num_gpu); - - /*! - * \brief Allocate memory for GPU computation // alloc only - */ - void CountDenseFeatureGroups(); // compute num_dense_feature_group - void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory - void AllocateGPUMemory(); - - /*! - * \ ResetGPUMemory - */ - void ResetGPUMemory(); - - /*! - * \ copy dense feature from CPU to GPU - */ - void copyDenseFeature(); - - /*! - * \brief Compute GPU feature histogram for the current leaf. - * Indices, gradients and Hessians have been copied to the device. - * \param leaf_num_data Number of data on current leaf - * \param use_all_features Set to true to not use feature masks, with a faster kernel - */ - void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - - void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, - int num_workgroups, int exp_workgroups_per_feature) { - ThreadData* td = &thread_data[device_id]; - td->device_id = device_id; - td->histogram_size = histogram_size; - td->leaf_num_data = leaf_num_data; - td->num_data = num_data_; - td->use_all_features = use_all_features; - td->is_constant_hessian = share_state_->is_constant_hessian; - td->num_workgroups = num_workgroups; - td->stream = stream_[device_id]; - td->device_features = device_features_[device_id]; - td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = device_data_indices_[device_id]; - td->device_gradients = device_gradients_[device_id]; - td->device_hessians = device_hessians_[device_id]; - td->hessians_const = hessians_[0]; - td->device_subhistograms = device_subhistograms_[device_id]; - td->sync_counters = sync_counters_[device_id]; - td->device_histogram_outputs = device_histogram_outputs_[device_id]; - td->exp_workgroups_per_feature = exp_workgroups_per_feature; - - td->kernel_start = &(kernel_start_[device_id]); - td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); - td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); - - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - td->output_size = output_size; - td->host_histogram_output = reinterpret_cast(host_histogram_outputs_) + host_output_offset; - td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); - } - - /*! - * \brief Wait for GPU kernel execution and read histogram - * \param histograms Destination of histogram results from GPU. - */ - template - void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); - - /*! - * \brief Construct GPU histogram asynchronously. - * Interface is similar to Dataset::ConstructHistograms(). - * \param is_feature_used A predicate vector for enabling each feature - * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. - * Set to nullptr to skip copy to GPU. - * \param num_data Number of data examples to be included in histogram - * \return true if GPU kernel is launched, false if GPU is not used - */ - bool ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); - - /*! 
brief Log2 of max number of workgroups per feature*/ - const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 - /*! brief Max total number of workgroups with preallocated workspace. - * If we use more than this number of workgroups, we have to reallocate subhistograms */ - std::vector preallocd_max_num_wg_; - - /*! \brief True if bagging is used */ - bool use_bagging_; - - /*! \brief GPU command queue object */ - std::vector stream_; - - /*! \brief total number of feature-groups */ - int num_feature_groups_; - /*! \brief total number of dense feature-groups, which will be processed on GPU */ - int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; - std::vector offset_gpu_feature_groups_; - /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. - * With bin size > 16, there are 4 features per DWORD. - * With bin size <=16, there are 8 features per DWORD. - */ - int dword_features_; - /*! \brief Max number of bins of training data, used to determine - * which GPU kernel to use */ - int max_num_bin_; - /*! \brief Used GPU kernel bin size (64, 256) */ - int histogram_size_; - int device_bin_size_; - /*! \brief Size of histogram bin entry, depending if single or double precision is used */ - size_t hist_bin_entry_sz_; - /*! \brief Indices of all dense feature-groups */ - std::vector dense_feature_group_map_; - /*! \brief Indices of all sparse feature-groups */ - std::vector sparse_feature_group_map_; - /*! \brief GPU memory object holding the training data */ - std::vector device_features_; - /*! \brief GPU memory object holding the ordered gradient */ - std::vector device_gradients_; - /*! \brief GPU memory object holding the ordered hessian */ - std::vector device_hessians_; - /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - std::vector feature_masks_; - /*! \brief GPU memory object holding the feature masks */ - std::vector device_feature_masks_; - /*! \brief Pointer to pinned memory of feature masks */ - char* ptr_pinned_feature_masks_ = nullptr; - /*! \brief GPU memory object holding indices of the leaf being processed */ - std::vector device_data_indices_; - /*! \brief GPU memory object holding counters for workgroup coordination */ - std::vector sync_counters_; - /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - std::vector device_subhistograms_; - /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - std::vector device_histogram_outputs_; - /*! \brief Host memory pointer for histogram outputs */ - void *host_histogram_outputs_; - /*! CUDA waitlist object for waiting for data transfer before kernel execution */ - std::vector kernel_wait_obj_; - /*! CUDA waitlist object for reading output histograms after kernel execution */ - std::vector histograms_wait_obj_; - /*! CUDA Asynchronous waiting object for copying indices */ - std::vector indices_future_; - /*! Asynchronous waiting object for copying gradients */ - std::vector gradients_future_; - /*! Asynchronous waiting object for copying Hessians */ - std::vector hessians_future_; - /*! 
Asynchronous waiting object for copying dense features */ - std::vector features_future_; - - // host-side buffer for converting feature data into featre4 data - int nthreads_; // number of Feature4* vector on host4_vecs_ - std::vector kernel_start_; - std::vector kernel_time_; // measure histogram kernel time - std::vector> kernel_input_wait_time_; - int num_gpu_; - int allocated_num_data_; // allocated data instances - pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu -}; - -} // namespace LightGBM -#else // USE_CUDA - -// When GPU support is not compiled in, quit with an error message - -namespace LightGBM { - -class CUDATreeLearner: public SerialTreeLearner { - public: - #pragma warning(disable : 4702) - explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA=1"); - } -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 7a21ed0691c3..5ca8a3f047f6 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -#ifdef USE_CUDA - if (LGBM_config_::current_learner == use_cpu_learner) { - SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } else { - ConstructHistograms(is_feature_used, use_subtract); - } -#else ConstructHistograms(is_feature_used, use_subtract); -#endif FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } From a9103a35b42f466b28aaa1d7b281abf3a0a34ebd Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 16 Jan 2023 23:00:22 -0600 Subject: [PATCH 05/17] comment out CI --- .appveyor.yml | 44 ---- .github/workflows/r_package.yml | 350 -------------------------------- .vsts-ci.yml | 138 ++++++------- 3 files changed, 69 insertions(+), 463 deletions(-) delete mode 100644 .appveyor.yml delete mode 100644 .github/workflows/r_package.yml diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 2d279b0f33e3..000000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,44 +0,0 @@ -version: 3.3.3.99.{build} - -image: Visual Studio 2015 -platform: x64 -configuration: # a trick to construct a build matrix with multiple Python versions - - '3.7' - -# only build pull requests and -# commits to 'master' or any branch starting with 'release' -branches: - only: - - master - - /^release/ - -environment: - matrix: - - COMPILER: MSVC - TASK: python - - COMPILER: MINGW - TASK: python - -clone_depth: 5 - -install: - - git submodule update --init --recursive # get `external_libs` folder - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) - - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% - - set PYTHON_VERSION=%CONFIGURATION% - - set CONDA_ENV="test-env" - - ps: | - $env:MINICONDA = "C:\Miniconda3-x64" - $env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" - $env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER" - $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() - -build: false - -test_script: - - conda config --remove channels defaults - - conda config --add channels nodefaults - - conda config --add channels conda-forge - - conda config --set 
channel_priority strict - - conda init powershell - - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml deleted file mode 100644 index 1574786cac6f..000000000000 --- a/.github/workflows/r_package.yml +++ /dev/null @@ -1,350 +0,0 @@ -name: R-package - -on: - push: - branches: - - master - pull_request: - branches: - - master - - release/* - -# automatically cancel in-progress builds if another commit is pushed -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - # hack to get around this: - # https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html - _R_CHECK_SYSTEM_CLOCK_: 0 - # ignore R CMD CHECK NOTE checking how long it has - # been since the last submission - _R_CHECK_CRAN_INCOMING_REMOTE_: 0 - # CRAN ignores the "installed size is too large" NOTE, - # so our CI can too. Setting to a large value here just - # to catch extreme problems - _R_CHECK_PKG_SIZES_THRESHOLD_: 100 - -jobs: - test: - name: ${{ matrix.task }} (${{ matrix.os }}, ${{ matrix.compiler }}, R ${{ matrix.r_version }}, ${{ matrix.build_type }}) - runs-on: ${{ matrix.os }} - container: ${{ matrix.container }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - include: - ################ - # CMake builds # - ################ - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 3.6 - build_type: cmake - container: 'ubuntu:18.04' - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cmake - container: 'ubuntu:22.04' - - os: ubuntu-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: 'ubuntu:18.04' - - os: ubuntu-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cmake - container: 'ubuntu:22.04' - - os: macOS-latest - task: r-package - compiler: gcc - r_version: 3.6 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cmake - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MINGW - r_version: 3.6 - build_type: cmake - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MSYS - r_version: 4.2 - build_type: cmake - container: null - # Visual Studio 2019 - - os: windows-2019 - task: r-package - compiler: MSVC - toolchain: MSVC - r_version: 3.6 - build_type: cmake - container: null - # Visual Studio 2022 - - os: windows-2022 - task: r-package - compiler: MSVC - toolchain: MSVC - r_version: 4.2 - build_type: cmake - container: null - ############### - # CRAN builds # - ############### - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MINGW - r_version: 3.6 - build_type: cran - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MSYS - r_version: 4.2 - build_type: cran - container: null - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cran - container: 'ubuntu:22.04' - - os: macOS-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cran - container: null - ################ - # Other checks # - ################ - - os: ubuntu-latest - task: r-rchk - compiler: 
gcc - r_version: 4.2 - build_type: cran - container: 'ubuntu:22.04' - steps: - - name: Prevent conversion of line endings on Windows - if: startsWith(matrix.os, 'windows') - shell: pwsh - run: git config --global core.autocrlf false - - name: Install packages used by third-party actions - if: startsWith(matrix.os, 'ubuntu') - shell: bash - run: | - apt-get update -y - apt-get install --no-install-recommends -y \ - ca-certificates \ - dirmngr \ - gpg \ - gpg-agent \ - software-properties-common \ - sudo - # install newest version of git - # ref: - # - https://unix.stackexchange.com/a/170831/550004 - # - https://git-scm.com/download/linux - add-apt-repository ppa:git-core/ppa -y - apt-get update -y - apt-get install --no-install-recommends -y \ - git - - name: Trust git cloning LightGBM - if: startsWith(matrix.os, 'ubuntu') - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: Install pandoc - uses: r-lib/actions/setup-pandoc@v2 - - name: install tinytex - if: startsWith(matrix.os, 'windows') - uses: r-lib/actions/setup-tinytex@v2 - env: - CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex - TINYTEX_INSTALLER: TinyTeX - - name: Setup and run tests on Linux and macOS - if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' - shell: bash - run: | - export TASK="${{ matrix.task }}" - export COMPILER="${{ matrix.compiler }}" - export GITHUB_ACTIONS="true" - if [[ "${{ matrix.os }}" == "macOS-latest" ]]; then - export OS_NAME="macos" - elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then - export OS_NAME="linux" - export IN_UBUNTU_BASE_CONTAINER="true" - # the default version of cmake provided on Ubuntu 18.04 (v3.10.2), is not supported by LightGBM - # see https://github.com/microsoft/LightGBM/issues/5642 - if [[ "${{ matrix.container }}" == "ubuntu:18.04" ]]; then - export INSTALL_CMAKE_FROM_RELEASES="true" - fi - fi - export BUILD_DIRECTORY="$GITHUB_WORKSPACE" - export R_VERSION="${{ matrix.r_version }}" - export R_BUILD_TYPE="${{ matrix.build_type }}" - $GITHUB_WORKSPACE/.ci/setup.sh - $GITHUB_WORKSPACE/.ci/test.sh - - name: Setup and run tests on Windows - if: startsWith(matrix.os, 'windows') - shell: pwsh -command ". 
{0}" - run: | - $env:BUILD_SOURCESDIRECTORY = $env:GITHUB_WORKSPACE - $env:LGB_VER = (Get-Content -TotalCount 1 $env:BUILD_SOURCESDIRECTORY\VERSION.txt).trim().replace('rc', '-') - $env:TOOLCHAIN = "${{ matrix.toolchain }}" - $env:R_VERSION = "${{ matrix.r_version }}" - $env:R_BUILD_TYPE = "${{ matrix.build_type }}" - $env:COMPILER = "${{ matrix.compiler }}" - $env:GITHUB_ACTIONS = "true" - $env:TASK = "${{ matrix.task }}" - & "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1" - test-r-sanitizers: - name: r-sanitizers (ubuntu-latest, R-devel, ${{ matrix.compiler }} ASAN/UBSAN) - timeout-minutes: 60 - runs-on: ubuntu-latest - container: wch1/r-debug - strategy: - fail-fast: false - matrix: - include: - - r_customization: san - compiler: gcc - - r_customization: csan - compiler: clang - steps: - - name: Trust git cloning LightGBM - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: Install packages - shell: bash - run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" - sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} - RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - - name: Run tests with sanitizers - shell: bash - run: | - cd R-package/tests - exit_code=0 - RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=-1 - cat ./tests.log - exit ${exit_code} - test-r-debian-clang: - name: r-package (debian, R-devel, clang) - timeout-minutes: 60 - runs-on: ubuntu-latest - container: rhub/debian-clang-devel - steps: - - name: Install Git before checkout - shell: bash - run: | - apt-get update --allow-releaseinfo-change - apt-get install --no-install-recommends -y git - - name: Trust git cloning LightGBM - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: update to clang 15 - shell: bash - run: | - # remove clang stuff that comes installed in the image - apt-get autoremove -y --purge \ - clang-* \ - libclang-* \ - libunwind-* \ - llvm-* - # - # replace it all with clang-15 - apt-get update -y - apt-get install --no-install-recommends -y \ - gnupg \ - lsb-release \ - software-properties-common \ - wget - # - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - - # - add-apt-repository "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-15 main" - apt-get install -y --no-install-recommends \ - clang-15 \ - clangd-15 \ - clang-format-15 \ - clang-tidy-15 \ - clang-tools-15 \ - lldb-15 \ - lld-15 \ - llvm-15-dev \ - llvm-15-tools \ - libomp-15-dev \ - libc++-15-dev \ - libc++abi-15-dev \ - libclang-common-15-dev \ - libclang-15-dev \ - libclang-cpp15-dev \ - libunwind-15-dev - # overwrite everything in /usr/bin with the new v15 versions - cp --remove-destination /usr/lib/llvm-15/bin/* /usr/bin/ - - name: Install packages and run tests - shell: bash - run: | - export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" - sh build-cran-package.sh - R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit 
-1 - if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then - echo "NOTEs, WARNINGs, or ERRORs have been found by R CMD check" - exit -1 - fi - all-r-package-jobs-successful: - if: always() - runs-on: ubuntu-latest - needs: [test, test-r-sanitizers, test-r-debian-clang] - steps: - - name: Note that all tests succeeded - uses: re-actors/alls-green@v1.2.2 - with: - jobs: ${{ toJSON(needs) }} diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 194aa5471131..26c687b0a20f 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -143,74 +143,74 @@ jobs: displayName: Setup - bash: $(Build.SourcesDirectory)/.ci/test.sh displayName: Test -########################################### -- job: QEMU_multiarch -########################################### - variables: - COMPILER: gcc - OS_NAME: 'linux' - PRODUCES_ARTIFACTS: 'true' - pool: - vmImage: ubuntu-22.04 - timeoutInMinutes: 180 - strategy: - matrix: - bdist: - TASK: bdist - ARCH: aarch64 - steps: - - script: | - sudo apt-get update - sudo apt-get install --no-install-recommends -y \ - binfmt-support \ - qemu \ - qemu-user \ - qemu-user-static - displayName: 'Install QEMU' - - script: | - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes - displayName: 'Enable Docker multi-architecture support' - - script: | - export ROOT_DOCKER_FOLDER=/LightGBM - cat > docker.env < docker-script.sh < docker.env < docker-script.sh < Date: Mon, 16 Jan 2023 23:50:17 -0600 Subject: [PATCH 06/17] remove more references --- include/LightGBM/bin.h | 4 +- include/LightGBM/cuda/cuda_algorithms.hpp | 4 +- include/LightGBM/cuda/cuda_column_data.hpp | 4 +- include/LightGBM/cuda/cuda_metadata.hpp | 4 +- include/LightGBM/cuda/cuda_metric.hpp | 4 +- .../LightGBM/cuda/cuda_objective_function.hpp | 4 +- include/LightGBM/cuda/cuda_random.hpp | 4 +- include/LightGBM/cuda/cuda_row_data.hpp | 4 +- include/LightGBM/cuda/cuda_split_info.hpp | 4 +- include/LightGBM/cuda/cuda_tree.hpp | 4 +- include/LightGBM/cuda/cuda_utils.h | 13 +--- include/LightGBM/cuda/vector_cudahost.h | 36 ++++----- include/LightGBM/dataset.h | 16 ++-- include/LightGBM/objective_function.h | 4 +- include/LightGBM/sample_strategy.h | 10 +-- include/LightGBM/train_share_states.h | 16 ++-- include/LightGBM/tree.h | 8 +- python-package/README.rst | 2 +- src/application/application.cpp | 2 +- src/boosting/bagging.hpp | 32 ++++---- src/boosting/cuda/cuda_score_updater.cpp | 4 +- src/boosting/cuda/cuda_score_updater.cu | 4 +- src/boosting/cuda/cuda_score_updater.hpp | 4 +- src/boosting/gbdt.cpp | 78 +++++++++---------- src/boosting/gbdt.h | 12 +-- src/boosting/goss.hpp | 20 ++--- src/cuda/cuda_algorithms.cu | 4 +- src/cuda/cuda_utils.cpp | 4 +- src/io/bin.cpp | 4 +- src/io/config.cpp | 21 ++--- src/io/cuda/cuda_column_data.cpp | 4 +- src/io/cuda/cuda_column_data.cu | 4 +- src/io/cuda/cuda_metadata.cpp | 4 +- src/io/cuda/cuda_row_data.cpp | 4 +- src/io/cuda/cuda_tree.cpp | 4 +- src/io/cuda/cuda_tree.cu | 4 +- src/io/dataset.cpp | 47 +++++------ src/io/dataset_loader.cpp | 12 +-- src/io/dense_bin.hpp | 2 +- src/io/metadata.cpp | 24 +++--- src/io/multi_val_dense_bin.hpp | 4 +- src/io/multi_val_sparse_bin.hpp | 4 +- src/io/train_share_states.cpp | 4 +- src/io/tree.cpp | 8 +- src/metric/cuda/cuda_binary_metric.cpp | 4 +- src/metric/cuda/cuda_binary_metric.hpp | 4 +- src/metric/cuda/cuda_pointwise_metric.cpp | 4 +- src/metric/cuda/cuda_pointwise_metric.cu | 4 +- src/metric/cuda/cuda_pointwise_metric.hpp | 4 +- src/metric/cuda/cuda_regression_metric.cpp | 4 +- src/metric/cuda/cuda_regression_metric.hpp | 4 
+- src/metric/metric.cpp | 50 ++++++------ src/objective/cuda/cuda_binary_objective.cpp | 4 +- src/objective/cuda/cuda_binary_objective.cu | 4 +- src/objective/cuda/cuda_binary_objective.hpp | 4 +- .../cuda/cuda_multiclass_objective.cpp | 4 +- .../cuda/cuda_multiclass_objective.cu | 4 +- .../cuda/cuda_multiclass_objective.hpp | 4 +- src/objective/cuda/cuda_rank_objective.cpp | 4 +- src/objective/cuda/cuda_rank_objective.cu | 4 +- src/objective/cuda/cuda_rank_objective.hpp | 4 +- .../cuda/cuda_regression_objective.cpp | 4 +- .../cuda/cuda_regression_objective.cu | 4 +- .../cuda/cuda_regression_objective.hpp | 4 +- .../cuda/cuda_best_split_finder.cpp | 4 +- .../cuda/cuda_best_split_finder.cu | 4 +- .../cuda/cuda_best_split_finder.hpp | 4 +- src/treelearner/cuda/cuda_data_partition.cpp | 4 +- src/treelearner/cuda/cuda_data_partition.cu | 4 +- src/treelearner/cuda/cuda_data_partition.hpp | 4 +- .../cuda/cuda_histogram_constructor.cpp | 4 +- .../cuda/cuda_histogram_constructor.cu | 4 +- .../cuda/cuda_histogram_constructor.hpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cu | 4 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 4 +- .../cuda/cuda_single_gpu_tree_learner.cu | 4 +- .../cuda/cuda_single_gpu_tree_learner.hpp | 10 +-- src/treelearner/serial_tree_learner.h | 2 +- src/treelearner/tree_learner.cpp | 14 +--- tests/python_package_test/test_basic.py | 2 +- tests/python_package_test/test_dask.py | 2 +- tests/python_package_test/test_engine.py | 14 ++-- tests/python_package_test/test_sklearn.py | 6 +- tests/python_package_test/test_utilities.py | 8 +- 86 files changed, 331 insertions(+), 372 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 71b60c493504..705d83f29e0c 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -480,13 +480,13 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA virtual const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const = 0; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index b302af8b63c1..ab3328bb5561 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ #define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -577,5 +577,5 @@ __device__ VAL_T PercentileDevice(const VAL_T* values, } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 0252669fe457..6668c92f2921 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ @@ -137,4 +137,4 @@ class CUDAColumnData { #endif // LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 6919d9723fb1..bc7339a84bf7 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_ #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_ @@ -55,4 +55,4 @@ class CUDAMetadata { #endif // LIGHTGBM_CUDA_CUDA_METADATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp index caeff267e8ef..5eb04c81c777 100644 --- a/include/LightGBM/cuda/cuda_metric.hpp +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_METRIC_HPP_ #define LIGHTGBM_CUDA_CUDA_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -36,6 +36,6 @@ class CUDAMetricInterface: public HOST_METRIC { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_METRIC_HPP_ diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 1010895e9d7a..dacaf252f8e6 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ #define LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -73,6 +73,6 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp index 1f07d64452da..6c28e44bc2d3 100644 --- a/include/LightGBM/cuda/cuda_random.hpp +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -5,7 +5,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ #define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -69,6 +69,6 @@ class CUDARandom { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 3950a5328553..a7e487dae9f0 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ @@ -176,4 +176,4 @@ class CUDARowData { } // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 5c525b431548..46b35ca37a59 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ #define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ @@ -102,4 +102,4 @@ class CUDASplitInfo { #endif // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index d557798270e0..e2836baa2be5 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_ #define LIGHTGBM_CUDA_CUDA_TREE_HPP_ @@ -170,4 +170,4 @@ class CUDATree : public Tree { #endif // LIGHTGBM_CUDA_CUDA_TREE_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index d5b94bc89e4a..771e1561f767 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -6,20 +6,15 @@ #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA #include #include #include #include -#endif // USE_CUDA || USE_CUDA_EXP - -#ifdef USE_CUDA_EXP #include -#endif // USE_CUDA_EXP namespace LightGBM { -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -27,9 +22,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = if (abort) exit(code); } } -#endif // USE_CUDA || USE_CUDA_EXP -#ifdef USE_CUDA_EXP #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } void SetCUDADevice(int gpu_device_id, const char* file, int line); @@ -184,8 +177,8 @@ class CUDAVector { size_t size_; }; -#endif // USE_CUDA_EXP - } // namespace LightGBM +#endif // USE_CUDA + #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 7c6e219cbbd9..6262a3bc9113 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,12 +5,11 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#ifdef USE_CUDA #include -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include -#endif #include enum LGBM_Device { @@ -43,44 +42,36 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; n = SIZE_ALIGNED(n); - #if defined(USE_CUDA) || defined(USE_CUDA_EXP) - if (LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - Log::Warning("Defaulting to malloc in CHAllocator!!!"); - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } - } else { + } else { ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - } - #else - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - #endif + } return ptr; } void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - #if defined(USE_CUDA) || defined(USE_CUDA_EXP) - if (LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes 
attributes; cudaPointerGetAttributes(&attributes, p); #if CUDA_VERSION >= 10000 - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #else - if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #endif - } else { + } else { _mm_free(p); - } - #else - _mm_free(p); - #endif + } } }; template @@ -90,4 +81,5 @@ bool operator!=(const CHAllocator&, const CHAllocator&); } // namespace LightGBM +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 74e3e9c1dad4..97bc146c9a9e 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -277,13 +277,13 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } void CreateCUDAMetadata(const int gpu_device_id); - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: /*! \brief Load wights from file */ @@ -329,9 +329,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::unique_ptr cuda_metadata_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; @@ -910,13 +910,13 @@ class Dataset { return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const CUDAColumnData* cuda_column_data() const { return cuda_column_data_.get(); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: void CreateCUDAColumnData(); @@ -968,9 +968,9 @@ class Dataset { /*! \brief mutex for threading safe call */ std::mutex mutex_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::unique_ptr cuda_column_data_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA std::string parser_config_str_; }; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 376a6f1a071d..ad188dc39676 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -97,7 +97,7 @@ class ObjectiveFunction { */ virtual bool IsCUDAObjective() const { return false; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! * \brief Convert output for CUDA version */ @@ -107,7 +107,7 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; } // namespace LightGBM diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 765632f7ecbf..51d3cbc16f23 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -38,9 +38,9 @@ class SampleStrategy { std::vector>& bag_data_indices() { return bag_data_indices_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA CUDAVector& cuda_bag_data_indices() { return cuda_bag_data_indices_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { objective_function_ = objective_function; @@ -72,10 +72,10 @@ class SampleStrategy { /*! \brief whether need to resize the gradient vectors */ bool need_resize_gradients_; - #ifdef USE_CUDA_EXP - /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */ + #ifdef USE_CUDA + /*! 
\brief Buffer for bag_data_indices_ on GPU, used only with cuda */ CUDAVector cuda_bag_data_indices_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; } // namespace LightGBM diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 5c14c7d51a47..8c50734695b2 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -126,7 +126,7 @@ class MultiValBinWrapper { } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData( uint8_t* bit_type, size_t* total_size, @@ -142,7 +142,7 @@ class MultiValBinWrapper { return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: bool is_use_subcol_ = false; @@ -183,9 +183,9 @@ struct TrainingShareStates { const std::vector& feature_hist_offsets() const { return feature_hist_offsets_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const std::vector& column_hist_offsets() const { return column_hist_offsets_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA bool IsSparseRowwise() { return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); @@ -235,7 +235,7 @@ struct TrainingShareStates { } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -250,13 +250,13 @@ struct TrainingShareStates { return nullptr; } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: std::vector feature_hist_offsets_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::vector column_hist_offsets_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA int num_hist_total_bin_ = 0; std::unique_ptr multi_val_bin_wrapper_; std::vector> hist_buf_; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 3e403b16e89b..13b3c41a2309 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -319,9 +319,9 @@ class Tree { inline bool is_linear() const { return is_linear_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA inline bool is_cuda_tree() const { return is_cuda_tree_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA inline void SetIsLinear(bool is_linear) { is_linear_ = is_linear; @@ -532,10 +532,10 @@ class Tree { std::vector> leaf_features_; /* \brief features used in leaf linear models; indexing is relative to used_features_ */ std::vector> leaf_features_inner_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! \brief Marks whether this tree is a CUDATree */ bool is_cuda_tree_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; inline void Tree::Split(int leaf, int feature, int real_feature, diff --git a/python-package/README.rst b/python-package/README.rst index 4494eb43a172..8021add44c63 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -125,7 +125,7 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue `__ for discussion). -To use the regular or experimental CUDA versions within Python, pass ``{"device": "cuda"}`` or ``{"device": "cuda_exp"}`` respectively as parameters. 
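As an editor's illustration only (not text from the README), the same ``device=cuda`` key-value pair also reaches the native library when training through the C API. The function names below are taken from LightGBM's ``c_api.h`` as the editor recalls them, so treat the exact signatures as an assumption; the data file name and the remaining parameters are placeholders::

    #include <LightGBM/c_api.h>
    #include <cstdio>

    int main() {
      DatasetHandle train_data = nullptr;
      BoosterHandle booster = nullptr;
      // "device=cuda" selects the CUDA learner after this rename (formerly "cuda_exp").
      const char* params = "objective=binary device=cuda num_leaves=31";
      if (LGBM_DatasetCreateFromFile("train.bin", params, nullptr, &train_data) != 0) {
        std::fprintf(stderr, "failed to load dataset\n");
        return 1;
      }
      LGBM_BoosterCreate(train_data, params, &booster);
      int is_finished = 0;
      for (int iter = 0; iter < 10; ++iter) {
        LGBM_BoosterUpdateOneIter(booster, &is_finished);
      }
      LGBM_BoosterFree(booster);
      LGBM_DatasetFree(train_data);
      return 0;
    }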
+To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. Build HDFS Version ~~~~~~~~~~~~~~~~~~ diff --git a/src/application/application.cpp b/src/application/application.cpp index d69efb5017ae..3e51136afc96 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -36,7 +36,7 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } - if (config_.device_type == std::string("cuda") || config_.device_type == std::string("cuda_exp")) { + if (config_.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; } } diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 65a937435105..4c2c81553e7c 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -47,33 +47,33 @@ class BaggingSampleStrategy : public SampleStrategy { Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } } @@ -103,11 +103,11 @@ class BaggingSampleStrategy : public SampleStrategy { bag_data_cnt_ = static_cast(config_->bagging_fraction * num_data_); } bag_data_indices_.resize(num_data_); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { cuda_bag_data_indices_.Resize(num_data_); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA bagging_runner_.ReSize(num_data_); bagging_rands_.clear(); for (int i = 0; @@ -118,7 +118,7 @@ class BaggingSampleStrategy : public SampleStrategy { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda_exp")) { + if (config_->device_type != std::string("cuda")) { const int group_threshold_usesubset = 100; const double average_bag_rate_threshold = 0.5; if (average_bag_rate <= average_bag_rate_threshold @@ -141,9 +141,9 @@ class BaggingSampleStrategy : public SampleStrategy { } else { bag_data_cnt_ = num_data_; bag_data_indices_.clear(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA cuda_bag_data_indices_.Clear(); - #endif // 
USE_CUDA_EXP + #endif // USE_CUDA bagging_runner_.ReSize(0); is_use_subset_ = false; } diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp index 9c514265ee40..d29fe9f4e394 100644 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -5,7 +5,7 @@ #include "cuda_score_updater.hpp" -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA namespace LightGBM { @@ -91,4 +91,4 @@ inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu index c2138957f199..a1fd3638d20a 100644 --- a/src/boosting/cuda/cuda_score_updater.cu +++ b/src/boosting/cuda/cuda_score_updater.cu @@ -5,7 +5,7 @@ #include "cuda_score_updater.hpp" -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA namespace LightGBM { @@ -42,4 +42,4 @@ void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/boosting/cuda/cuda_score_updater.hpp b/src/boosting/cuda/cuda_score_updater.hpp index 01326ab1d354..ec728777e66c 100644 --- a/src/boosting/cuda/cuda_score_updater.hpp +++ b/src/boosting/cuda/cuda_score_updater.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -60,6 +60,6 @@ class CUDAScoreUpdater: public ScoreUpdater { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 72e16ee7e707..fbaf337eea60 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -68,14 +68,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; - if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) { + if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + const int gpu_device_id = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + #endif // USE_CUDA } // load forced_splits file @@ -116,15 +114,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } training_metrics_.shrink_to_fit(); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA num_data_ = train_data_->num_data(); @@ -186,11 +184,11 @@ void GBDT::AddValidDataset(const Dataset* valid_data, } // for a validation dataset, we need its score and metric auto new_score_updater = - #ifdef USE_CUDA_EXP - config_->device_type == std::string("cuda_exp") ? + #ifdef USE_CUDA + config_->device_type == std::string("cuda") ? std::unique_ptr(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_, objective_function_ != nullptr && objective_function_->IsCUDAObjective())) : - #endif // USE_CUDA_EXP + #endif // USE_CUDA std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); // update score for (int i = 0; i < iter_; ++i) { @@ -481,15 +479,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); // we need to predict out-of-bag scores of data for boosting if (num_data_ - bag_data_cnt > 0) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } else { @@ -503,17 +501,17 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { #else std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t /*num_data*/) const { -#endif // USE_CUDA_EXP - #ifdef USE_CUDA_EXP +#endif // USE_CUDA + #ifdef USE_CUDA const bool evaluation_on_cuda = metric->IsCUDAMetric(); if ((boosting_on_gpu_ && evaluation_on_cuda) || (!boosting_on_gpu_ && !evaluation_on_cuda)) { - #endif // USE_CUDA_EXP + #endif // USE_CUDA return metric->Eval(score, objective_function_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } else if (boosting_on_gpu_ && !evaluation_on_cuda) { const size_t total_size = static_cast(num_data) * static_cast(num_tree_per_iteration_); if (total_size > host_score_.size()) { @@ -529,7 +527,7 @@ std::vector GBDT::EvalOneMetric(const Metric* metric, const double* scor CopyFromHostToCUDADevice(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__); return metric->Eval(cuda_score_.RawData(), objective_function_); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } std::string GBDT::OutputMetric(int iter) { @@ -660,14 +658,14 @@ void 
GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { num_data = valid_score_updater_[used_idx]->num_data(); *out_len = static_cast(num_data) * num_class_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::vector host_raw_scores; if (boosting_on_gpu_) { host_raw_scores.resize(static_cast(*out_len), 0.0); CopyFromCUDADeviceToHost(host_raw_scores.data(), raw_scores, static_cast(*out_len), __FILE__, __LINE__); raw_scores = host_raw_scores.data(); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (objective_function_ != nullptr) { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data; ++i) { @@ -730,26 +728,26 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } training_metrics_.shrink_to_fit(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (train_data != train_data_) { train_data_ = train_data; data_sample_strategy_->UpdateTrainingData(train_data); // not same training data, need reset score and others // create score tracker - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA // update score for (int i = 0; i < iter_; ++i) { @@ -827,8 +825,8 @@ void GBDT::ResetGradientBuffers() { const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); if (objective_function_ != nullptr) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda") && boosting_on_gpu_) { if (cuda_gradients_.Size() < total_size) { cuda_gradients_.Resize(total_size); cuda_hessians_.Resize(total_size); @@ -836,16 +834,16 @@ void GBDT::ResetGradientBuffers() { gradients_pointer_ = cuda_gradients_.RawData(); hessians_pointer_ = cuda_hessians_.RawData(); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (gradients_.size() < total_size) { gradients_.resize(total_size); hessians_.resize(total_size); } gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) { if (gradients_.size() < total_size) { gradients_.resize(total_size); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d71245980b36..1f784b94e593 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -542,7 +542,7 @@ class GBDT : public GBDTBase { /*! \brief Parser config file content */ std::string parser_config_str_ = ""; -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA /*! \brief First order derivative of training data */ std::vector> gradients_; /*! 
\brief Second order derivative of training data */ @@ -557,18 +557,18 @@ class GBDT : public GBDTBase { score_t* gradients_pointer_; /*! \brief Pointer to hessian vector, can be on CPU or GPU */ score_t* hessians_pointer_; - /*! \brief Whether boosting is done on GPU, used for cuda_exp */ + /*! \brief Whether boosting is done on GPU, used for cuda */ bool boosting_on_gpu_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! \brief Gradient vector on GPU */ CUDAVector cuda_gradients_; /*! \brief Hessian vector on GPU */ CUDAVector cuda_hessians_; - /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */ + /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda */ mutable std::vector host_score_; - /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */ + /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda */ mutable CUDAVector cuda_score_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA /*! \brief Number of training data */ data_size_t num_data_; diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 34b099e051bb..9d55d89ee097 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -43,33 +43,33 @@ class GOSSStrategy : public SampleStrategy { bag_data_cnt_ = left_cnt; // set bagging data to tree learner if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 5a6b3eb74ef0..19c1507419e9 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -509,4 +509,4 @@ template __device__ double PercentileDevice @@ -28,4 +28,4 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/bin.cpp b/src/io/bin.cpp index a5430c483d3b..652b874c94d8 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -886,7 +886,7 @@ namespace LightGBM { return nullptr; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA template <> const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, @@ -1081,6 +1081,6 @@ namespace LightGBM { return to_return; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // namespace LightGBM diff --git a/src/io/config.cpp b/src/io/config.cpp index 8827414c2e99..ba2672fcaef4 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -178,7 +178,13 @@ void GetDeviceType(const std::unordered_map& params, s } else if (value == std::string("cuda")) { *device_type = "cuda"; } else if (value == std::string("cuda_exp")) { - *device_type = "cuda_exp"; + Log::Warning( + "Found device_type='cuda_exp' passed through params. " + "That is an alias for device_type='cuda'. " + "Use device_type='cuda' to suppress this warning. " + "In the future, this warning will become an error. " + ); + *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -260,7 +266,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); - if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) { + if (device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); @@ -373,26 +379,21 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - if (device_type == std::string("gpu") || device_type == std::string("cuda")) { + if (device_type == std::string("gpu")) { // force col-wise for gpu, and cuda version force_col_wise = true; force_row_wise = false; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } - } else if (device_type == std::string("cuda_exp")) { - // force row-wise for cuda_exp version + } else if (device_type == std::string("cuda")) { + // force row-wise for cuda version force_col_wise = false; force_row_wise = true; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } } - // force gpu_use_dp for CUDA - if (device_type == std::string("cuda") && !gpu_use_dp) { - Log::Warning("CUDA currently requires double precision calculations."); - gpu_use_dp = true; - } // linear tree learner must be serial type and run on CPU device if (linear_tree) { if (device_type != std::string("cpu")) { diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index c4b0bb62e584..a1080cb2b902 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -308,4 +308,4 @@ void CUDAColumnData::InitColumnMetaInfo() { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index 3ab70e9a5758..75ff6234e09e 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -58,4 +58,4 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 2a3dd380254a..02cd42cf805f 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -89,4 +89,4 @@ void CUDAMetadata::SetInitScore(const double* init_score, data_size_t len) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 3c66a164d9d4..30bbb273193a 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -474,4 +474,4 @@ template const uint64_t* CUDARowData::GetPartitionPtr() const; } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 196563340ae5..923e51961e0b 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -337,4 +337,4 @@ void CUDATree::AsConstantTree(double val) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 2a6448259d7f..62020c3a09ae 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -456,4 +456,4 @@ void CUDATree::LaunchAddPredictionToScoreKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a237e79b2680..061bdd2464bd 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -345,9 +345,9 @@ void Dataset::Construct(std::vector>* bin_mappers, auto features_in_group = OneFeaturePerGroup(used_features); auto is_sparse = io_config.is_enable_sparse; - if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) { + if (io_config.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; - if ((io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) && is_sparse) { + if ((io_config.device_type == std::string("cuda")) && is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); is_sparse = false; } @@ -355,8 +355,7 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda") - || io_config.device_type == std::string("cuda_exp"); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), @@ -447,14 +446,10 @@ void Dataset::FinishLoad() { } metadata_.FinishLoad(); - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - CreateCUDAColumnData(); - metadata_.CreateCUDAMetadata(gpu_device_id_); - } else { - cuda_column_data_.reset(nullptr); - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + CreateCUDAColumnData(); + metadata_.CreateCUDAMetadata(gpu_device_id_); + #endif // USE_CUDA is_finish_load_ = true; } @@ -862,15 +857,13 @@ void Dataset::CopySubrow(const Dataset* fullset, device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); - metadata_.CreateCUDAMetadata(gpu_device_id_); - } - cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + #ifdef USE_CUDA + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); } - #endif // USE_CUDA_EXP + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + #endif // USE_CUDA } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1508,13 +1501,9 @@ void Dataset::AddFeaturesFrom(Dataset* other) { raw_data_.push_back(other->raw_data_[i]); } } - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - CreateCUDAColumnData(); - } 
else { - cuda_column_data_ = nullptr; - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + CreateCUDAColumnData(); + #endif // USE_CUDA } const void* Dataset::GetColWiseData( @@ -1536,7 +1525,7 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); int num_columns = 0; @@ -1671,6 +1660,6 @@ void Dataset::CreateCUDAColumnData() { feature_to_column); } -#endif // USE_CUDA_EXP +#endif // USE_CUDA } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 246424600b03..e384af9e0792 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -279,14 +279,10 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; - #ifdef USE_CUDA_EXP - if (config_.device_type == std::string("cuda_exp")) { - dataset->CreateCUDAColumnData(); - dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); - } else { - dataset->cuda_column_data_ = nullptr; - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + #endif // USE_CUDA } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 5d95d9dc6073..0ebcdc1a6181 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -467,7 +467,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA std::vector> data_; #else std::vector> data_; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 053d1b43c104..07e9701b1ca6 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,9 +18,9 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA cuda_metadata_ = nullptr; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::Init(const char* data_filename) { @@ -344,11 +344,11 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { init_score_[i] = Common::AvoidInf(init_score[i]); } init_score_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetInitScore(init_score_.data(), len); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { @@ -387,11 +387,11 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { for (data_size_t i = 0; i < num_data_; ++i) { label_[i] = Common::AvoidInf(label[i]); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetLabel(label_.data(), len); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len) { @@ -428,11 +428,11 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { } CalculateQueryWeights(); weight_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetWeights(weights_.data(), len); } - #endif // USE_CUDA_EXP + 
#endif // USE_CUDA } void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len) { @@ -477,7 +477,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { } CalculateQueryWeights(); query_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { if (query_weights_.size() > 0) { CHECK_EQ(query_weights_.size(), static_cast(num_queries_)); @@ -486,7 +486,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { cuda_metadata_->SetQuery(query_boundaries_.data(), nullptr, num_queries_); } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) { @@ -635,12 +635,12 @@ void Metadata::FinishLoad() { CalculateQueryBoundaries(); } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA void Metadata::CreateCUDAMetadata(const int gpu_device_id) { cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_); } -#endif // USE_CUDA_EXP +#endif // USE_CUDA void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 8de9cf305952..b4fbfbe673aa 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -211,13 +211,13 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 80acbb681ab6..eaa30ef0a0cc 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -292,13 +292,13 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 199424733f80..f6462697a93d 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -382,9 +382,9 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA column_hist_offsets_ = *offsets; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 39b5c23d4d1c..ce45d20cf454 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,9 +53,9 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) leaf_features_.resize(max_leaves_); leaf_features_inner_.resize(max_leaves_); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA is_cuda_tree_ = false; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, @@ -731,9 +731,9 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA is_cuda_tree_ = false; - #endif // USE_CUDA_EXP + #endif // USE_CUDA if 
((num_leaves_ <= 1) && !is_linear_) { return; diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index d526fddeecb2..cedf909b9892 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_binary_metric.hpp" @@ -28,4 +28,4 @@ std::vector CUDABinaryMetricInterface::Eval(co } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index ae50dac381dd..72d9edc8b40e 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -52,6 +52,6 @@ class CUDABinaryLoglossMetric: public CUDABinaryMetricInterface @@ -66,4 +66,4 @@ template void CUDAPointwiseMetricInterface #include @@ -38,6 +38,6 @@ class CUDAPointwiseMetricInterface: public CUDAMetricInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp index 15b219160a30..5e61214ad1e8 100644 --- a/src/metric/cuda/cuda_regression_metric.cpp +++ b/src/metric/cuda/cuda_regression_metric.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -31,4 +31,4 @@ CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetricInterface< } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp index 342e49542eb4..6e9d44a6b046 100644 --- a/src/metric/cuda/cuda_regression_metric.hpp +++ b/src/metric/cuda/cuda_regression_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -54,6 +54,6 @@ class CUDAL2Metric : public CUDARegressionMetricInterface @@ -206,4 +206,4 @@ void CUDABinaryLogloss::LaunchResetOVACUDALabelKernel() const { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 77f58d8318f1..c87654921cad 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) #define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) @@ -58,6 +58,6 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 2ea3de870e99..55ff694bf4f7 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_multiclass_objective.hpp" @@ -59,4 +59,4 @@ const double* CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, c } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu index 797c7cec7bf0..b9687ed430e2 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cu +++ b/src/objective/cuda/cuda_multiclass_objective.cu @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -105,4 +105,4 @@ const double* CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp b/src/objective/cuda/cuda_multiclass_objective.hpp index e6e326306e31..328ae6515516 100644 --- a/src/objective/cuda/cuda_multiclass_objective.hpp +++ b/src/objective/cuda/cuda_multiclass_objective.hpp @@ -5,7 +5,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -74,5 +74,5 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index 50ea16e80799..227498ed39af 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -64,4 +64,4 @@ void CUDARankXENDCG::GenerateItemRands() const { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 43a39fdcbaf8..af9f595f1aed 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_rank_objective.hpp" @@ -658,4 +658,4 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index f922c240bd28..a7f047017d88 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define NUM_QUERY_PER_BLOCK (10) @@ -118,5 +118,5 @@ class CUDARankXENDCG : public CUDALambdaRankObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp index 91d7b8059da6..beb030721ae3 100644 --- a/src/objective/cuda/cuda_regression_objective.cpp +++ b/src/objective/cuda/cuda_regression_objective.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_regression_objective.hpp" @@ -85,4 +85,4 @@ double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel(const int class_id) } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 99feec132508..28fc3168e41a 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_regression_objective.hpp" #include @@ -353,4 +353,4 @@ const double* CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const dat } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp index 593fcf1cfcb6..e2eb1abbd006 100644 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) @@ -135,5 +135,5 @@ class CUDARegressionPoissonLoss : public CUDARegressionObjectiveInterface @@ -383,4 +383,4 @@ void CUDABestSplitFinder::SetUsedFeatureByNode(const std::vector& is_fea } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 04896c40e7a9..3fee5562953c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -1802,4 +1802,4 @@ void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index e9c12922cde6..69f8169f8d85 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -211,5 +211,5 @@ class CUDABestSplitFinder { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 271b9d97e5f0..3ad157ef0105 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -370,4 +370,4 @@ void CUDADataPartition::ResetByLeafPred(const std::vector& leaf_pred, int n } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 4f88d3034acc..b1d3fa496ab9 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_data_partition.hpp" @@ -1071,4 +1071,4 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 3f02977a615a..84050565c085 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -384,5 +384,5 @@ class CUDADataPartition { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 83227165af19..7e6be1c1069c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_histogram_constructor.hpp" @@ -193,4 +193,4 @@ void CUDAHistogramConstructor::ResetConfig(const Config* config) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index e1888f0c4b66..c884383304a4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_histogram_constructor.hpp" @@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index e364003ed934..7e600e7c01b4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -165,5 +165,5 @@ class CUDAHistogramConstructor { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 9d093f0f164b..6aa020d9ea0d 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" @@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 15c2983ef1d2..29e42f67ead9 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -5,7 +5,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" #include @@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index fe04cf5bcace..769f956b95c3 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -156,5 +156,5 @@ class CUDALeafSplits { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index f8e6fbfec725..28c5562aa01c 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_single_gpu_tree_learner.hpp" @@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index f4a87de499cb..8a558ddc43d1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index b1922f5f28c5..bfad5e81c952 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -9,7 +9,7 @@ #include #include -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" @@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { } // namespace LightGBM -#else // USE_CUDA_EXP +#else // USE_CUDA // When GPU support is not compiled in, quit with an error message @@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA_EXP=1"); + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDAP=1"); } }; } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 0409821850b1..14b78eb6a577 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif defined(USE_CUDA) || defined(USE_CUDA_EXP) +#elif defined(USE_CUDA) /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! 
\brief hessians of current iteration, ordered for cache optimized */ diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 40a29e33a531..e637660c1425 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -40,24 +40,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con return new VotingParallelTreeLearner(config); } } else if (device_type == std::string("cuda")) { - if (learner_type == std::string("serial")) { - return new CUDATreeLearner(config); - } else if (learner_type == std::string("feature")) { - return new FeatureParallelTreeLearner(config); - } else if (learner_type == std::string("data")) { - return new DataParallelTreeLearner(config); - } else if (learner_type == std::string("voting")) { - return new VotingParallelTreeLearner(config); - } - } else if (device_type == std::string("cuda_exp")) { if (learner_type == std::string("serial")) { if (config->num_gpu == 1) { return new CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { - Log::Fatal("cuda_exp only supports training on a single GPU."); + Log::Fatal("cuda only supports training on a single GPU."); } } else { - Log::Fatal("cuda_exp only supports training on a single machine."); + Log::Fatal("cuda only supports training on a single machine."); } } return nullptr; diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index f3fc65d3e4c1..b3a00c319c90 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -48,7 +48,7 @@ def test_basic(tmp_path): assert bst.current_iteration() == 20 assert bst.num_trees() == 20 assert bst.num_model_per_iteration() == 1 - if getenv('TASK', '') != 'cuda_exp': + if getenv('TASK', '') != 'cuda': assert bst.lower_bound() == pytest.approx(-2.9040190126976606) assert bst.upper_bound() == pytest.approx(3.3182142872462883) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 748824cef451..068fa4a6f388 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -57,7 +57,7 @@ pytestmark = [ pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'), - pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Fails to run with CUDA Experimental interface') + pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Fails to run with CUDA interface') ] diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9f91ffcbeb46..4691120184aa 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -588,7 +588,7 @@ def test_multi_class_error(): assert results['training']['multi_error@2'][-1] == pytest.approx(0) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_auc_mu(): # should give same result as binary auc for 2 classes X, y = load_digits(n_class=10, return_X_y=True) @@ -1661,7 +1661,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): return trainset -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone 
constraints are not yet supported by CUDA version') @pytest.mark.parametrize("test_with_categorical_variable", [True, False]) def test_monotone_constraints(test_with_categorical_variable): def is_increasing(y): @@ -1751,7 +1751,7 @@ def has_interaction(treef): assert are_interactions_enforced(constrained_model, feature_sets) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') def test_monotone_penalty(): def are_first_splits_non_monotone(tree, n, monotone_constraints): if n <= 0: @@ -1791,7 +1791,7 @@ def are_there_monotone_splits(tree, monotone_constraints): # test if a penalty as high as the depth indeed prohibits all monotone splits -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') def test_monotone_penalty_max(): max_depth = 5 monotone_constraints = [1, -1, 0] @@ -2652,7 +2652,7 @@ def test_model_size(): pytest.skipTest('not enough RAM') -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_get_split_value_histogram(): X, y = make_synthetic_regression() X = np.repeat(X, 3, axis=0) @@ -2735,7 +2735,7 @@ def test_get_split_value_histogram(): gbm.get_split_value_histogram(2) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_early_stopping_for_only_first_metric(): def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration, @@ -3573,7 +3573,7 @@ def hook(obj): assert "LV" in dumped_model_str -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Forced splits are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Forced splits are not yet supported by CUDA version') def test_force_split_with_feature_fraction(tmp_path): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index c6d8a5bcaf41..746c958a7304 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -121,7 +121,7 @@ def test_regression(): assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_multiclass(): X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -134,7 +134,7 @@ def test_multiclass(): assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == 
pytest.approx(ret) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_lambdarank(): rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) @@ -1091,7 +1091,7 @@ def test_nan_handle(): np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_first_metric_only(): def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only): diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index e75198bb1214..01c45884a578 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -91,15 +91,15 @@ def dummy_metric(_, __): "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" ] - cuda_exp_lines = [ - "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda_exp version. Fall back to evaluation on CPU.", - "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda_exp version. Fall back to evaluation on CPU.", + cuda_ines = [ + "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.", + "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda version. 
Fall back to evaluation on CPU.", ] with open(log_filename, "rt", encoding="utf-8") as f: actual_log = f.read().strip() actual_log_wo_gpu_stuff = [] for line in actual_log.split("\n"): - if not any(line.startswith(gpu_or_cuda_exp_line) for gpu_or_cuda_exp_line in gpu_lines + cuda_exp_lines): + if not any(line.startswith(gpu_or_cuda_line) for gpu_or_cuda_line in gpu_lines + cuda_lines): actual_log_wo_gpu_stuff.append(line) assert "\n".join(actual_log_wo_gpu_stuff) == expected_log From 01822059b8948f3c997d21f626faf2ddee0bb9a0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 19:39:02 -0600 Subject: [PATCH 07/17] revert some unnecessaary changes --- include/LightGBM/cuda/vector_cudahost.h | 31 ++++++++++++++++--------- src/io/config.cpp | 3 +-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 6262a3bc9113..45adfd67858b 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -42,35 +42,44 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; n = SIZE_ALIGNED(n); - if (LGBM_config_::current_device == lgbm_device_cuda) { + #ifdef USE_CUDA + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - Log::Warning("Defaulting to malloc in CHAllocator!!!"); - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } - } else { + } else { ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - } + } + #else + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + #endif return ptr; } void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - if (LGBM_config_::current_device == lgbm_device_cuda) { + #ifdef USE_CUDA + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); #if CUDA_VERSION >= 10000 - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #else - if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #endif - } else { + } else { _mm_free(p); + } + #else + _mm_free(p); + #endif } } }; diff --git a/src/io/config.cpp b/src/io/config.cpp index ba2672fcaef4..2d891f769ee1 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -182,8 +182,7 @@ void GetDeviceType(const std::unordered_map& params, s "Found device_type='cuda_exp' passed through params. " "That is an alias for device_type='cuda'. " "Use device_type='cuda' to suppress this warning. " - "In the future, this warning will become an error. " - ); + "In the future, this warning will become an error. 
"); *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); From 07a4a926142a3d95cc39165d9801284a065857e4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 19:51:04 -0600 Subject: [PATCH 08/17] revert a few more mistakes --- include/LightGBM/cuda/vector_cudahost.h | 1 - src/io/dataset.cpp | 20 +++++++++++++------- src/io/dataset_loader.cpp | 10 +++++++--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 45adfd67858b..8df88c1c48cf 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -80,7 +80,6 @@ struct CHAllocator { #else _mm_free(p); #endif - } } }; template diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 061bdd2464bd..551683a81197 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -858,12 +858,14 @@ void Dataset::CopySubrow(const Dataset* fullset, gpu_device_id_ = fullset->gpu_device_id_; #ifdef USE_CUDA - if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); - metadata_.CreateCUDAMetadata(gpu_device_id_); + if (device_type_ == std::string("cuda")) { + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); } - cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); - #endif // USE_CUDA + #endif // USE_CUDA_EXP } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1502,8 +1504,12 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } } #ifdef USE_CUDA - CreateCUDAColumnData(); - #endif // USE_CUDA + if (device_type_ == std::string("cuda")) { + CreateCUDAColumnData(); + } else { + cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } const void* Dataset::GetColWiseData( diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index e384af9e0792..76d6fad794cc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -280,9 +280,13 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; #ifdef USE_CUDA - dataset->CreateCUDAColumnData(); - dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); - #endif // USE_CUDA + if (config_.device_type == std::string("cuda")) { + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + } else { + dataset->cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); From 0c60b71726e17de75b6bc507fb669a19d44c437b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 20:30:07 -0600 Subject: [PATCH 09/17] revert another change that ignored params --- src/io/dataset.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 551683a81197..68d01ec11920 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -447,9 +447,13 @@ void Dataset::FinishLoad() { metadata_.FinishLoad(); #ifdef USE_CUDA - CreateCUDAColumnData(); - metadata_.CreateCUDAMetadata(gpu_device_id_); - #endif // USE_CUDA + if (device_type_ == std::string("cuda")) { + 
CreateCUDAColumnData(); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } else { + cuda_column_data_.reset(nullptr); + } + #endif // USE_CUDA_EXP is_finish_load_ = true; } From 118d32a4a8a69577db64ef304ce8a32ad1a06145 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 20:47:40 -0600 Subject: [PATCH 10/17] sigh --- src/boosting/gbdt.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index fbaf337eea60..b1db41621067 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -71,9 +71,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; #ifdef USE_CUDA - const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); - #endif // USE_CUDA + if (config_->device_type == std::string("cuda")) { + const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } + #endif // USE_CUDA_EXP } // load forced_splits file From e4cc9d05b56472145bbe72b698ada6e1e0d0ddcc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 21:55:32 -0600 Subject: [PATCH 11/17] remove CUDATreeLearner --- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- src/treelearner/data_parallel_tree_learner.cpp | 1 - src/treelearner/feature_parallel_tree_learner.cpp | 1 - src/treelearner/parallel_tree_learner.h | 1 - src/treelearner/tree_learner.cpp | 1 - src/treelearner/voting_parallel_tree_learner.cpp | 1 - 6 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 8df88c1c48cf..83fbe5cda9b7 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,11 +5,12 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ -#ifdef USE_CUDA #include +#ifdef USE_CUDA #include #include +#endif #include enum LGBM_Device { @@ -89,5 +90,4 @@ bool operator!=(const CHAllocator&, const CHAllocator&); } // namespace LightGBM -#endif // USE_CUDA #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index c9ff36da9f44..677b7dc6eb82 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -276,7 +276,6 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index f4edfe03dc16..c5202f3d706d 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -77,7 +77,6 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 03b60aca6329..29f4e1688b99 100644 
--- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -12,7 +12,6 @@ #include #include -#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "serial_tree_learner.h" diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index e637660c1425..aaa0ce72ecd9 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -4,7 +4,6 @@ */ #include -#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "linear_tree_learner.h" #include "parallel_tree_learner.h" diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index aacd5caa4412..0eee032839cd 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -501,7 +501,6 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From e734d6f02a418b7e4b2ee261230418a9a64e39be Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 22:08:02 -0600 Subject: [PATCH 12/17] fix tests, docs --- .ci/test.sh | 114 ++++++++++---------- docs/Installation-Guide.rst | 4 +- python-package/README.rst | 6 +- python-package/setup.py | 2 +- src/boosting/gbdt.cpp | 2 +- src/io/dataset.cpp | 6 +- src/io/dataset_loader.cpp | 2 +- tests/python_package_test/test_utilities.py | 2 +- 8 files changed, 66 insertions(+), 72 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 12b4ae027993..a45cb7ae40ef 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -34,63 +34,7 @@ if [[ "$TASK" == "cpp-tests" ]]; then exit 0 fi -CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" - -if [[ $TASK == "if-else" ]]; then - conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy - source activate $CONDA_ENV - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 - cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 - exit 0 -fi - -if [[ $TASK == "swig" ]]; then - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build - if [[ $OS_NAME == "macos" ]]; then - cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. - else - cmake -DUSE_SWIG=ON .. 
- fi - make -j4 || exit -1 - if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then - objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 - objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 - python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 - fi - if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar - fi - exit 0 -fi - -if [[ $TASK == "lint" ]]; then - conda create -q -y -n $CONDA_ENV \ - ${CONDA_PYTHON_REQUIREMENT} \ - cmakelint \ - cpplint \ - isort \ - mypy \ - pycodestyle \ - pydocstyle \ - "r-lintr>=3.0" - source activate $CONDA_ENV - echo "Linting Python code" - pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 - isort . --check-only || exit -1 - mypy --ignore-missing-imports python-package/ || true - echo "Linting R code" - Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 - echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 - cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") - cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 - exit 0 -fi - -conda create -q -y -n $CONDA_ENV "${CONDA_PYTHON_REQUIREMENT}" +conda create -q -y -n $CONDA_ENV "python=$PYTHON_VERSION[build=*cpython]" source activate $CONDA_ENV cd $BUILD_DIRECTORY @@ -128,6 +72,60 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi +if [[ $TASK == "lint" ]]; then + conda install -q -y -n $CONDA_ENV \ + cmakelint \ + cpplint \ + flake8 \ + isort \ + mypy \ + pydocstyle \ + "r-lintr>=3.0" + echo "Linting Python code" + flake8 \ + --ignore=E501,W503 \ + --exclude=./.nuget,./external_libs,./python-package/build \ + . || exit -1 + pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + isort . --check-only || exit -1 + mypy --ignore-missing-imports python-package/ || true + echo "Linting R code" + Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 + echo "Linting C++ code" + cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 + cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") + cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 + exit 0 +fi + +if [[ $TASK == "if-else" ]]; then + conda install -q -y -n $CONDA_ENV numpy + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. 
&& make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 + cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 + exit 0 +fi + +if [[ $TASK == "swig" ]]; then + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build + if [[ $OS_NAME == "macos" ]]; then + cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. + else + cmake -DUSE_SWIG=ON .. + fi + make -j4 || exit -1 + if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then + objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 + objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 + python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 + fi + if [[ $PRODUCES_ARTIFACTS == "true" ]]; then + cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar + fi + exit 0 +fi + # re-including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda install -q -y -n $CONDA_ENV \ cloudpickle \ @@ -139,7 +137,7 @@ conda install -q -y -n $CONDA_ENV \ pandas \ psutil \ pytest \ - ${CONDA_PYTHON_REQUIREMENT} \ + "python=$PYTHON_VERSION[build=*cpython]" \ python-graphviz \ scikit-learn \ scipy || exit -1 diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 6ce3f2f0c987..46dd0559074e 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -605,8 +605,8 @@ Docker Refer to `GPU Docker folder `__. -Build CUDA Version (Experimental) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Build CUDA Version +~~~~~~~~~~~~~~~~~~ The `original GPU build <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL. diff --git a/python-package/README.rst b/python-package/README.rst index 8021add44c63..6cabf33ab29c 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -121,9 +121,7 @@ Build CUDA Version All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well, and `CMake`_ (version 3.16 or higher) is strongly required. -**CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide `__. - -Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue `__ for discussion). +**CUDA** library (version 10.0 or higher) is needed: details for installation can be found in `Installation Guide `__. To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. @@ -211,8 +209,6 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. 
-Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. - Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. diff --git a/python-package/setup.py b/python-package/setup.py index af07bfbdd5b4..012dd0f8577b 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, 'Compile CUDA Experimental version'), + ('cuda-exp', None, '(deprecated) Alias for 'cuda'. Use 'cuda' instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b1db41621067..1f0a5405bf49 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -75,7 +75,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // load forced_splits file diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 68d01ec11920..de368d3036db 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -453,7 +453,7 @@ void Dataset::FinishLoad() { } else { cuda_column_data_.reset(nullptr); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA is_finish_load_ = true; } @@ -869,7 +869,7 @@ void Dataset::CopySubrow(const Dataset* fullset, } cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1513,7 +1513,7 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } else { cuda_column_data_ = nullptr; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } const void* Dataset::GetColWiseData( diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 76d6fad794cc..621176db2c59 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -286,7 +286,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac } else { dataset->cuda_column_data_ = nullptr; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index 01c45884a578..cfd5b133b6e2 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -91,7 +91,7 @@ def dummy_metric(_, __): "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" ] - cuda_ines = [ + cuda_lines = [ "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. 
Fall back to evaluation on CPU.", "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda version. Fall back to evaluation on CPU.", ] From 0b4df93323c898abd60ac392202a78ca19c495a1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 22:17:07 -0600 Subject: [PATCH 13/17] fix quoting in setup.py --- python-package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/setup.py b/python-package/setup.py index 012dd0f8577b..617d3b33d6ef 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, '(deprecated) Alias for 'cuda'. Use 'cuda' instead.'), + ('cuda-exp', None, '(deprecated) Alias for "cuda". Use "cuda" instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), From 967c005f7c8a2ea2cc8a2049c19c3e6a94eb68b8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 23:07:21 -0600 Subject: [PATCH 14/17] restore all CI --- .appveyor.yml | 44 ++++ .github/workflows/r_package.yml | 350 ++++++++++++++++++++++++++++++++ .vsts-ci.yml | 138 ++++++------- 3 files changed, 463 insertions(+), 69 deletions(-) create mode 100644 .appveyor.yml create mode 100644 .github/workflows/r_package.yml diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 000000000000..2d279b0f33e3 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,44 @@ +version: 3.3.3.99.{build} + +image: Visual Studio 2015 +platform: x64 +configuration: # a trick to construct a build matrix with multiple Python versions + - '3.7' + +# only build pull requests and +# commits to 'master' or any branch starting with 'release' +branches: + only: + - master + - /^release/ + +environment: + matrix: + - COMPILER: MSVC + TASK: python + - COMPILER: MINGW + TASK: python + +clone_depth: 5 + +install: + - git submodule update --init --recursive # get `external_libs` folder + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) + - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% + - set PYTHON_VERSION=%CONFIGURATION% + - set CONDA_ENV="test-env" + - ps: | + $env:MINICONDA = "C:\Miniconda3-x64" + $env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" + $env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER" + $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() + +build: false + +test_script: + - conda config --remove channels defaults + - conda config --add channels nodefaults + - conda config --add channels conda-forge + - conda config --set channel_priority strict + - conda init powershell + - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml new file mode 100644 index 000000000000..1574786cac6f --- /dev/null +++ b/.github/workflows/r_package.yml @@ -0,0 +1,350 @@ +name: R-package + +on: + push: + branches: + - master + pull_request: + branches: + - master + - release/* + +# automatically cancel in-progress builds if another commit is pushed +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + # hack to get around this: + # https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html + _R_CHECK_SYSTEM_CLOCK_: 0 + # ignore R CMD 
CHECK NOTE checking how long it has + # been since the last submission + _R_CHECK_CRAN_INCOMING_REMOTE_: 0 + # CRAN ignores the "installed size is too large" NOTE, + # so our CI can too. Setting to a large value here just + # to catch extreme problems + _R_CHECK_PKG_SIZES_THRESHOLD_: 100 + +jobs: + test: + name: ${{ matrix.task }} (${{ matrix.os }}, ${{ matrix.compiler }}, R ${{ matrix.r_version }}, ${{ matrix.build_type }}) + runs-on: ${{ matrix.os }} + container: ${{ matrix.container }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + ################ + # CMake builds # + ################ + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 3.6 + build_type: cmake + container: 'ubuntu:18.04' + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cmake + container: 'ubuntu:22.04' + - os: ubuntu-latest + task: r-package + compiler: clang + r_version: 3.6 + build_type: cmake + container: 'ubuntu:18.04' + - os: ubuntu-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cmake + container: 'ubuntu:22.04' + - os: macOS-latest + task: r-package + compiler: gcc + r_version: 3.6 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: clang + r_version: 3.6 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cmake + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MINGW + r_version: 3.6 + build_type: cmake + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MSYS + r_version: 4.2 + build_type: cmake + container: null + # Visual Studio 2019 + - os: windows-2019 + task: r-package + compiler: MSVC + toolchain: MSVC + r_version: 3.6 + build_type: cmake + container: null + # Visual Studio 2022 + - os: windows-2022 + task: r-package + compiler: MSVC + toolchain: MSVC + r_version: 4.2 + build_type: cmake + container: null + ############### + # CRAN builds # + ############### + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MINGW + r_version: 3.6 + build_type: cran + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MSYS + r_version: 4.2 + build_type: cran + container: null + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cran + container: 'ubuntu:22.04' + - os: macOS-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cran + container: null + ################ + # Other checks # + ################ + - os: ubuntu-latest + task: r-rchk + compiler: gcc + r_version: 4.2 + build_type: cran + container: 'ubuntu:22.04' + steps: + - name: Prevent conversion of line endings on Windows + if: startsWith(matrix.os, 'windows') + shell: pwsh + run: git config --global core.autocrlf false + - name: Install packages used by third-party actions + if: startsWith(matrix.os, 'ubuntu') + shell: bash + run: | + apt-get update -y + apt-get install --no-install-recommends -y \ + ca-certificates \ + dirmngr \ + gpg \ + gpg-agent \ + software-properties-common \ + sudo + # install newest version of git + # ref: + # - https://unix.stackexchange.com/a/170831/550004 + # - https://git-scm.com/download/linux + add-apt-repository ppa:git-core/ppa -y + apt-get update -y + apt-get install --no-install-recommends -y \ + git + - name: Trust 
git cloning LightGBM + if: startsWith(matrix.os, 'ubuntu') + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: Install pandoc + uses: r-lib/actions/setup-pandoc@v2 + - name: install tinytex + if: startsWith(matrix.os, 'windows') + uses: r-lib/actions/setup-tinytex@v2 + env: + CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex + TINYTEX_INSTALLER: TinyTeX + - name: Setup and run tests on Linux and macOS + if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' + shell: bash + run: | + export TASK="${{ matrix.task }}" + export COMPILER="${{ matrix.compiler }}" + export GITHUB_ACTIONS="true" + if [[ "${{ matrix.os }}" == "macOS-latest" ]]; then + export OS_NAME="macos" + elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then + export OS_NAME="linux" + export IN_UBUNTU_BASE_CONTAINER="true" + # the default version of cmake provided on Ubuntu 18.04 (v3.10.2), is not supported by LightGBM + # see https://github.com/microsoft/LightGBM/issues/5642 + if [[ "${{ matrix.container }}" == "ubuntu:18.04" ]]; then + export INSTALL_CMAKE_FROM_RELEASES="true" + fi + fi + export BUILD_DIRECTORY="$GITHUB_WORKSPACE" + export R_VERSION="${{ matrix.r_version }}" + export R_BUILD_TYPE="${{ matrix.build_type }}" + $GITHUB_WORKSPACE/.ci/setup.sh + $GITHUB_WORKSPACE/.ci/test.sh + - name: Setup and run tests on Windows + if: startsWith(matrix.os, 'windows') + shell: pwsh -command ". {0}" + run: | + $env:BUILD_SOURCESDIRECTORY = $env:GITHUB_WORKSPACE + $env:LGB_VER = (Get-Content -TotalCount 1 $env:BUILD_SOURCESDIRECTORY\VERSION.txt).trim().replace('rc', '-') + $env:TOOLCHAIN = "${{ matrix.toolchain }}" + $env:R_VERSION = "${{ matrix.r_version }}" + $env:R_BUILD_TYPE = "${{ matrix.build_type }}" + $env:COMPILER = "${{ matrix.compiler }}" + $env:GITHUB_ACTIONS = "true" + $env:TASK = "${{ matrix.task }}" + & "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1" + test-r-sanitizers: + name: r-sanitizers (ubuntu-latest, R-devel, ${{ matrix.compiler }} ASAN/UBSAN) + timeout-minutes: 60 + runs-on: ubuntu-latest + container: wch1/r-debug + strategy: + fail-fast: false + matrix: + include: + - r_customization: san + compiler: gcc + - r_customization: csan + compiler: clang + steps: + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: Install packages + shell: bash + run: | + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} + RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 + - name: Run tests with sanitizers + shell: bash + run: | + cd R-package/tests + exit_code=0 + RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=-1 + cat ./tests.log + exit ${exit_code} + test-r-debian-clang: + name: r-package (debian, R-devel, clang) + timeout-minutes: 60 + runs-on: ubuntu-latest + container: rhub/debian-clang-devel + steps: + - name: Install Git before checkout + shell: bash + run: | + apt-get update --allow-releaseinfo-change + apt-get install --no-install-recommends -y git + - name: Trust git cloning LightGBM + run: | + git 
config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: update to clang 15 + shell: bash + run: | + # remove clang stuff that comes installed in the image + apt-get autoremove -y --purge \ + clang-* \ + libclang-* \ + libunwind-* \ + llvm-* + # + # replace it all with clang-15 + apt-get update -y + apt-get install --no-install-recommends -y \ + gnupg \ + lsb-release \ + software-properties-common \ + wget + # + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + # + add-apt-repository "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-15 main" + apt-get install -y --no-install-recommends \ + clang-15 \ + clangd-15 \ + clang-format-15 \ + clang-tidy-15 \ + clang-tools-15 \ + lldb-15 \ + lld-15 \ + llvm-15-dev \ + llvm-15-tools \ + libomp-15-dev \ + libc++-15-dev \ + libc++abi-15-dev \ + libclang-common-15-dev \ + libclang-15-dev \ + libclang-cpp15-dev \ + libunwind-15-dev + # overwrite everything in /usr/bin with the new v15 versions + cp --remove-destination /usr/lib/llvm-15/bin/* /usr/bin/ + - name: Install packages and run tests + shell: bash + run: | + export PATH=/opt/R-devel/bin/:${PATH} + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + sh build-cran-package.sh + R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 + if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then + echo "NOTEs, WARNINGs, or ERRORs have been found by R CMD check" + exit -1 + fi + all-r-package-jobs-successful: + if: always() + runs-on: ubuntu-latest + needs: [test, test-r-sanitizers, test-r-debian-clang] + steps: + - name: Note that all tests succeeded + uses: re-actors/alls-green@v1.2.2 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 26c687b0a20f..194aa5471131 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -143,74 +143,74 @@ jobs: displayName: Setup - bash: $(Build.SourcesDirectory)/.ci/test.sh displayName: Test -# ########################################### -# - job: QEMU_multiarch -# ########################################### -# variables: -# COMPILER: gcc -# OS_NAME: 'linux' -# PRODUCES_ARTIFACTS: 'true' -# pool: -# vmImage: ubuntu-22.04 -# timeoutInMinutes: 180 -# strategy: -# matrix: -# bdist: -# TASK: bdist -# ARCH: aarch64 -# steps: -# - script: | -# sudo apt-get update -# sudo apt-get install --no-install-recommends -y \ -# binfmt-support \ -# qemu \ -# qemu-user \ -# qemu-user-static -# displayName: 'Install QEMU' -# - script: | -# docker run --rm --privileged multiarch/qemu-user-static --reset -p yes -# displayName: 'Enable Docker multi-architecture support' -# - script: | -# export ROOT_DOCKER_FOLDER=/LightGBM -# cat > docker.env < docker-script.sh < docker.env < docker-script.sh < Date: Sun, 29 Jan 2023 20:46:30 -0600 Subject: [PATCH 15/17] Apply suggestions from code review Co-authored-by: shiyu1994 --- src/treelearner/tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index aaa0ce72ecd9..0018bdaf8b94 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -43,10 +43,10 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con if (config->num_gpu == 1) { return new 
CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { - Log::Fatal("cuda only supports training on a single GPU."); + Log::Fatal("Currently cuda version only supports training on a single GPU."); } } else { - Log::Fatal("cuda only supports training on a single machine."); + Log::Fatal("Currently cuda version only supports training on a single machine."); } } return nullptr; From b111ab62fec11e6bf237f12dade6a5947c7bf693 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 29 Jan 2023 20:47:36 -0600 Subject: [PATCH 16/17] Apply suggestions from code review --- src/boosting/gbdt.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1f784b94e593..288c345a4f1f 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -557,16 +557,16 @@ class GBDT : public GBDTBase { score_t* gradients_pointer_; /*! \brief Pointer to hessian vector, can be on CPU or GPU */ score_t* hessians_pointer_; - /*! \brief Whether boosting is done on GPU, used for cuda */ + /*! \brief Whether boosting is done on GPU, used for device_type=cuda */ bool boosting_on_gpu_; #ifdef USE_CUDA /*! \brief Gradient vector on GPU */ CUDAVector cuda_gradients_; /*! \brief Hessian vector on GPU */ CUDAVector cuda_hessians_; - /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda */ + /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with device_type=cuda */ mutable std::vector host_score_; - /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda */ + /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with device_type=cuda */ mutable CUDAVector cuda_score_; #endif // USE_CUDA From ac7ab774d5152478aa660028c2ee0d866a008daa Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 29 Jan 2023 21:06:05 -0600 Subject: [PATCH 17/17] completely remove cuda_exp, update docs --- .ci/setup.sh | 2 +- CMakeLists.txt | 6 ------ docs/Parameters.rst | 8 +++++++- include/LightGBM/config.h | 5 ++++- python-package/setup.py | 10 +--------- src/io/config.cpp | 7 ------- 6 files changed, 13 insertions(+), 25 deletions(-) diff --git a/.ci/setup.sh b/.ci/setup.sh index 89fbed442ed1..f7da21286d7d 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -106,7 +106,7 @@ else # Linux || exit -1 fi fi - if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then + if [[ $TASK == "cuda" ]]; then echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections apt-get update apt-get install --no-install-recommends -y \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d005662f37a..750b41ab8164 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training " OFF) -option(USE_CUDA_EXP "(DEPRECATED) Alias for 'USE_CUDA'. Use 'USE_CUDA' instead." OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( @@ -137,11 +136,6 @@ else() add_definitions(-DUSE_SOCKET) endif() -if(USE_CUDA_EXP) - message(WARNING "Option -DUSE_CUDA_EXP=ON is deprecated. 
Use -DUSE_CUDA=ON instead.") - set(USE_CUDA ON CACHE BOOL "Building CUDA-enabled version" FORCE) -endif() - if(USE_CUDA) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") enable_language(CUDA) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index d2cb9acdaf6d..45f0837d7c89 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -207,7 +207,13 @@ Core Parameters - ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` - - device for the tree learning, you can use GPU to achieve the faster learning + - device for the tree learning + + - ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware + + - ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA + + - ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA - **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c2de146b3854..b2a7112d612e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -225,7 +225,10 @@ struct Config { // type = enum // options = cpu, gpu, cuda // alias = device - // desc = device for the tree learning, you can use GPU to achieve the faster learning + // desc = device for the tree learning + // desc = ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware + // desc = ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA + // desc = ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support diff --git a/python-package/setup.py b/python-package/setup.py index 617d3b33d6ef..243dd5cb0af5 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,6 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, '(deprecated) Alias for "cuda". 
Use "cuda" instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -106,7 +105,6 @@ def compile_cpp( use_mingw: bool = False, use_gpu: bool = False, use_cuda: bool = False, - use_cuda_exp: bool = False, use_mpi: bool = False, use_hdfs: bool = False, boost_root: Optional[str] = None, @@ -128,9 +126,6 @@ def compile_cpp( logger.info("Starting to compile the library.") - if use_cuda_exp: - use_cuda = True - cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")] if integrated_opencl: use_gpu = False @@ -236,7 +231,6 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False - self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -261,7 +255,7 @@ def run(self) -> None: LOG_PATH.touch() if not self.precompile: copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, @@ -282,7 +276,6 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False - self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -305,7 +298,6 @@ def finalize_options(self) -> None: install.integrated_opencl = self.integrated_opencl install.gpu = self.gpu install.cuda = self.cuda - install.cuda_exp = self.cuda_exp install.boost_root = self.boost_root install.boost_dir = self.boost_dir install.boost_include_dir = self.boost_include_dir diff --git a/src/io/config.cpp b/src/io/config.cpp index 2d891f769ee1..86b64a52d105 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -177,13 +177,6 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "gpu"; } else if (value == std::string("cuda")) { *device_type = "cuda"; - } else if (value == std::string("cuda_exp")) { - Log::Warning( - "Found device_type='cuda_exp' passed through params. " - "That is an alias for device_type='cuda'. " - "Use device_type='cuda' to suppress this warning. " - "In the future, this warning will become an error. "); - *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); }